diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000000..033b1cbfe576 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,88 @@ +# When making changes, verify the output of: +# clang-tidy -list-checks +--- +Checks: "-*,\ + bugprone-argument-comment,\ + bugprone-dangling-handle,\ + bugprone-fold-init-type,\ + bugprone-forward-declaration-namespace,\ + bugprone-forwarding-reference-overload,\ + bugprone-shadow,\ + bugprone-sizeof-*,\ + bugprone-string-constructor,\ + bugprone-undefined-memory-manipulation,\ + bugprone-unused-return-value,\ + bugprone-use-after-move,\ + cert-env33-c,\ + cert-err58-cpp,\ + cert-msc30-c,\ + cert-msc50-cpp,\ + clang-analyzer-core.NullDereference,\ + clang-analyzer-core.StackAddressEscape,\ + clang-analyzer-deadcode.DeadStores,\ + clang-diagnostic-*,\ + -clang-diagnostic-missing-designated-field-initializers,\ + concurrency-mt-unsafe,\ + cppcoreguidelines-avoid-non-const-global-variables,\ + cppcoreguidelines-missing-std-forward,\ + cppcoreguidelines-pro-type-member-init,\ + cppcoreguidelines-special-member-functions,\ + cppcoreguidelines-virtual-class-destructor,\ + google-build-using-namespace,\ + google-explicit-constructor,\ + google-readability-avoid-underscore-in-googletest-name,\ + misc-definitions-in-headers,\ + misc-redundant-expression,\ + modernize-make-shared,\ + modernize-use-emplace,\ + modernize-use-noexcept,\ + modernize-use-override,\ + modernize-use-using,\ + performance-faster-string-find,\ + performance-for-range-copy,\ + performance-implicit-conversion-in-loop,\ + performance-inefficient-algorithm,\ + performance-inefficient-string-concatenation,\ + performance-inefficient-vector-operation,\ + performance-move-const-arg,\ + performance-move-constructor-init,\ + performance-no-automatic-move,\ + performance-no-int-to-ptr,\ + performance-noexcept-move-constructor,\ + performance-noexcept-swap,\ + performance-trivially-destructible,\ + performance-type-promotion-in-math-fn,\ + performance-unnecessary-copy-initialization,\ + performance-unnecessary-value-param,\ + readability-braces-around-statements,\ + readability-duplicate-include,\ + readability-isolate-declaration,\ + readability-operators-representation,\ + readability-redundant-string-init" + +WarningsAsErrors: "bugprone-use-after-move" + +CheckOptions: +- key: bugprone-easily-swappable-parameters.MinimumLength + value: 4 +- key: cppcoreguidelines-avoid-non-const-global-variables.AllowThreadLocal + value: true +- key: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor + value: true +- key: cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove + value: true +- key: modernize-use-using.IgnoreExternC + value: true +- key: performance-move-const-arg.CheckTriviallyCopyableMove + value: false +- key: performance-unnecessary-value-param.AllowedTypes + value: '[Pp]ointer$;[Pp]tr$;[Rr]ef(erence)?$' +- key: performance-unnecessary-copy-initialization.AllowedTypes + value: '[Pp]ointer$;[Pp]tr$;[Rr]ef(erence)?$' +- key: readability-operators-representation.BinaryOperators + value: '&&;&=;&;|;~;!;!=;||;|=;^;^=' +- key: readability-redundant-string-init.StringNames + value: '::std::basic_string' +- key: readability-named-parameter.InsertPlainNamesInForwardDecls + value: true +... diff --git a/.github/actions/build-folly/action.yml b/.github/actions/build-folly/action.yml index 70229199958b..84f99de18d25 100644 --- a/.github/actions/build-folly/action.yml +++ b/.github/actions/build-folly/action.yml @@ -1,7 +1,17 @@ name: build-folly +description: Build folly and dependencies (skipped if cache hit) +inputs: + cache-hit: + description: Whether the folly cache was hit + required: true runs: using: composite steps: - name: Build folly and dependencies + if: ${{ inputs.cache-hit != 'true' }} run: make build_folly shell: bash + - name: Skip folly build (using cached version) + if: ${{ inputs.cache-hit == 'true' }} + run: echo "Folly build skipped - using cached version" + shell: bash diff --git a/.github/actions/cache-folly/action.yml b/.github/actions/cache-folly/action.yml new file mode 100644 index 000000000000..f54a5a9a5a2e --- /dev/null +++ b/.github/actions/cache-folly/action.yml @@ -0,0 +1,33 @@ +name: cache-folly +description: Cache folly build to speed up CI +outputs: + cache-hit: + description: Whether the cache was hit + value: ${{ steps.cache-folly-build.outputs.cache-hit }} +runs: + using: composite + steps: + - name: Extract FOLLY_MK_HASH + id: extract-folly-hash + shell: bash + run: | + FOLLY_MK_HASH=$(md5sum folly.mk | cut -d' ' -f1) + echo "hash=$FOLLY_MK_HASH" >> $GITHUB_OUTPUT + - name: Extract FOLLY_INSTALL_DIR + id: extract-folly-install-dir + shell: bash + run: | + FOLLY_INSTALL_DIR=$(cd third-party/folly && python3 build/fbcode_builder/getdeps.py show-inst-dir) + echo "dir=$(echo $FOLLY_INSTALL_DIR | sed 's|installed/folly|installed|')" >> $GITHUB_OUTPUT + - name: Cache folly build + id: cache-folly-build + uses: actions/cache@v4 + with: + # Cache the folly build directory + path: ${{ steps.extract-folly-install-dir.outputs.dir }} + # Key is based on: + # - OS and architecture + # - The docker image, which may not always be specified/known + # - Hash of folly.mk, which includes the folly repository commit hash + # NOTE: this is still only intended for DEBUG folly builds + key: folly-build-${{ runner.os }}-${{ runner.arch }}-${{ github.job_container.image }}-${{ steps.extract-folly-hash.outputs.hash }} diff --git a/.github/actions/cache-getdeps-downloads/action.yml b/.github/actions/cache-getdeps-downloads/action.yml new file mode 100644 index 000000000000..ca871bf1c8cd --- /dev/null +++ b/.github/actions/cache-getdeps-downloads/action.yml @@ -0,0 +1,21 @@ +name: cache-getdeps-downloads +description: Cache getdeps downloads to avoid unreliable mirrors and speed up builds +outputs: + cache-hit: + description: Whether the cache was hit + value: ${{ steps.cache-downloads.outputs.cache-hit }} +runs: + using: composite + steps: + - name: Cache getdeps downloads + id: cache-downloads + uses: actions/cache@v4 + with: + # Use a fixed path that we control - folly.mk will sync with getdeps downloads dir + path: /tmp/rocksdb-getdeps-cache + # Use a rolling cache key - the cache accumulates downloads over time + # The key includes a weekly timestamp to ensure periodic refresh + key: getdeps-downloads-${{ runner.os }}-${{ runner.arch }}-week-${{ github.run_id }} + restore-keys: | + getdeps-downloads-${{ runner.os }}-${{ runner.arch }}-week- + getdeps-downloads-${{ runner.os }}-${{ runner.arch }}- diff --git a/.github/actions/install-maven/action.yml b/.github/actions/install-maven/action.yml index 69a925272ac1..815ec751f2de 100644 --- a/.github/actions/install-maven/action.yml +++ b/.github/actions/install-maven/action.yml @@ -4,8 +4,8 @@ runs: steps: - name: Install Maven run: | - wget --no-check-certificate https://dlcdn.apache.org/maven/maven-3/3.9.6/binaries/apache-maven-3.9.6-bin.tar.gz - tar zxf apache-maven-3.9.6-bin.tar.gz - echo "export M2_HOME=$(pwd)/apache-maven-3.9.6" >> $GITHUB_ENV - echo "$(pwd)/apache-maven-3.9.6/bin" >> $GITHUB_PATH + wget --no-check-certificate https://archive.apache.org/dist/maven/maven-3/3.9.11/binaries/apache-maven-3.9.11-bin.tar.gz + tar zxf apache-maven-3.9.11-bin.tar.gz + echo "export M2_HOME=$(pwd)/apache-maven-3.9.11" >> $GITHUB_ENV + echo "$(pwd)/apache-maven-3.9.11/bin" >> $GITHUB_PATH shell: bash diff --git a/.github/actions/setup-folly/action.yml b/.github/actions/setup-folly/action.yml index 41cec847ce60..8702b92aa857 100644 --- a/.github/actions/setup-folly/action.yml +++ b/.github/actions/setup-folly/action.yml @@ -3,5 +3,9 @@ runs: using: composite steps: - name: Checkout folly sources - run: make checkout_folly + run: | + make checkout_folly + shell: bash + - name: Install patchelf and libaio + run: apt-get update -y && apt-get install -y patchelf libaio-dev shell: bash diff --git a/.github/actions/windows-build-steps/action.yml b/.github/actions/windows-build-steps/action.yml index 9213f2e828fc..699d4aa0e580 100644 --- a/.github/actions/windows-build-steps/action.yml +++ b/.github/actions/windows-build-steps/action.yml @@ -4,6 +4,16 @@ runs: steps: - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1.3.1 + - name: Cache ccache directory + id: ccache-cache + uses: actions/cache@v4 + with: + path: C:\a\rocksdb\rocksdb\.ccache + key: rocksdb-build-${{ runner.os }}-${{ runner.arch }}-ccache-${{ hashFiles('CMakeLists.txt', 'cmake/**/*.cmake') }}-v1 + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + max-size: "10GB" - name: Custom steps env: THIRDPARTY_HOME: ${{ github.workspace }}/thirdparty @@ -11,9 +21,9 @@ runs: CMAKE_BIN: C:/Program Files/CMake/bin/cmake.exe CTEST_BIN: C:/Program Files/CMake/bin/ctest.exe JAVA_HOME: C:/Program Files/BellSoft/LibericaJDK-8 - SNAPPY_HOME: ${{ github.workspace }}/thirdparty/snappy-1.1.8 - SNAPPY_INCLUDE: ${{ github.workspace }}/thirdparty/snappy-1.1.8;${{ github.workspace }}/thirdparty/snappy-1.1.8/build - SNAPPY_LIB_DEBUG: ${{ github.workspace }}/thirdparty/snappy-1.1.8/build/Debug/snappy.lib + SNAPPY_HOME: ${{ github.workspace }}/thirdparty/snappy-1.2.2 + SNAPPY_INCLUDE: ${{ github.workspace }}/thirdparty/snappy-1.2.2;${{ github.workspace }}/thirdparty/snappy-1.2.2/build + SNAPPY_LIB_DEBUG: ${{ github.workspace }}/thirdparty/snappy-1.2.2/build/Debug/snappy.lib run: |- # NOTE: if ... Exit $LASTEXITCODE lines needed to exit and report failure echo ===================== Install Dependencies ===================== @@ -22,14 +32,14 @@ runs: mkdir $Env:THIRDPARTY_HOME cd $Env:THIRDPARTY_HOME echo "Building Snappy dependency..." - curl -Lo snappy-1.1.8.zip https://github.com/google/snappy/archive/refs/tags/1.1.8.zip + curl -Lo snappy-1.2.2.zip https://github.com/google/snappy/archive/refs/tags/1.2.2.zip if(!$?) { Exit $LASTEXITCODE } - unzip -q snappy-1.1.8.zip + unzip -q snappy-1.2.2.zip if(!$?) { Exit $LASTEXITCODE } - cd snappy-1.1.8 + cd snappy-1.2.2 mkdir build cd build - & cmake -G "$Env:CMAKE_GENERATOR" .. + & cmake -G "$Env:CMAKE_GENERATOR" .. -DSNAPPY_BUILD_TESTS=OFF -DSNAPPY_BUILD_BENCHMARKS=OFF if(!$?) { Exit $LASTEXITCODE } msbuild Snappy.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64 if(!$?) { Exit $LASTEXITCODE } @@ -38,11 +48,12 @@ runs: $env:Path = $env:JAVA_HOME + ";" + $env:Path mkdir build cd build - & cmake -G "$Env:CMAKE_GENERATOR" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE="$Env:CMAKE_PORTABLE" -DSNAPPY=1 -DJNI=1 .. + & cmake -G "$Env:CMAKE_GENERATOR" -DCMAKE_BUILD_TYPE=Debug -DWIN_CI=1 -DPORTABLE="$Env:CMAKE_PORTABLE" -DSNAPPY=1 -DXPRESS=1 -DJNI=1 .. if(!$?) { Exit $LASTEXITCODE } cd .. echo "Building with VS version: $Env:CMAKE_GENERATOR" - msbuild build/rocksdb.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64 + # use more parallel processes than the number of processes available, as most of the compile command would be cache hit + msbuild build/rocksdb.sln /m:32 /p:LinkIncremental=false -property:Configuration=Debug -property:Platform=x64 if(!$?) { Exit $LASTEXITCODE } echo ========================= Test RocksDB ========================= build_tools\run_ci_db_test.ps1 -SuiteRun arena_test,db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16 @@ -52,3 +63,7 @@ runs: & ctest -C Debug -j 16 if(!$?) { Exit $LASTEXITCODE } shell: pwsh + - name: Show ccache stats + shell: pwsh + run: | + ccache --show-stats -v diff --git a/.github/workflows/clang-tidy-comment.yml b/.github/workflows/clang-tidy-comment.yml new file mode 100644 index 000000000000..9615c890f85f --- /dev/null +++ b/.github/workflows/clang-tidy-comment.yml @@ -0,0 +1,105 @@ +name: clang-tidy +on: + push: + pull_request_target: + types: [opened, synchronize, reopened] + +permissions: + pull-requests: write + +jobs: + clang-tidy: + if: github.repository_owner == 'facebook' + runs-on: + labels: 4-core-ubuntu + container: + image: ghcr.io/facebook/rocksdb_ubuntu:24.0 + steps: + - uses: actions/checkout@v4.1.0 + with: + ref: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.sha || github.sha }} + - name: Mark workspace as safe for git + run: git config --global --add safe.directory $GITHUB_WORKSPACE + - name: Determine diff base + id: diff-base + run: | + if [ "${{ github.event_name }}" = "pull_request_target" ]; then + BASE="${{ github.event.pull_request.base.sha }}" + else + BASE="${{ github.event.before }}" + fi + if [ -z "$BASE" ] || echo "$BASE" | grep -q '^0\{40\}$'; then + echo "skip=true" >> "$GITHUB_OUTPUT" + echo "No valid diff base; skipping clang-tidy." + else + git fetch --depth=1 origin "$BASE" + echo "ref=$BASE" >> "$GITHUB_OUTPUT" + echo "skip=false" >> "$GITHUB_OUTPUT" + fi + - name: Install clang-tidy + if: steps.diff-base.outputs.skip != 'true' + run: apt-get update && apt-get install -y clang-tidy + - name: Generate compile_commands.json + if: steps.diff-base.outputs.skip != 'true' + run: | + mkdir build && cd build + cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -DCMAKE_C_COMPILER=clang-18 \ + -DCMAKE_CXX_COMPILER=clang++-18 .. + cd .. + ln -sf build/compile_commands.json compile_commands.json + - name: Run clang-tidy on changed files + id: clang-tidy + if: steps.diff-base.outputs.skip != 'true' + run: | + python3 tools/run_clang_tidy.py \ + -j 4 \ + --diff-base ${{ steps.diff-base.outputs.ref }} \ + --github-annotations \ + --github-step-summary \ + --comment-output clang-tidy-comment.md + continue-on-error: true + - name: Post clang-tidy results to PR + if: github.event_name == 'pull_request_target' && always() && steps.diff-base.outputs.skip != 'true' + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const commentPath = 'clang-tidy-comment.md'; + if (!fs.existsSync(commentPath)) { + core.info('No comment file generated; skipping PR comment.'); + return; + } + const body = fs.readFileSync(commentPath, 'utf8'); + const marker = ''; + const prNumber = context.payload.pull_request.number; + try { + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + }); + const existing = comments.find(c => c.body.includes(marker)); + if (existing) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existing.id, + body, + }); + core.info(`Updated existing comment ${existing.id}`); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + body, + }); + core.info('Created new PR comment'); + } + } catch (err) { + core.warning(`Could not post PR comment: ${err.message}`); + } + - name: Fail if clang-tidy found issues + if: steps.clang-tidy.outcome == 'failure' + run: exit 1 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 1370a5460402..e10a95ecd0a0 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -10,7 +10,7 @@ jobs: runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 @@ -27,60 +27,72 @@ jobs: git config --global --add safe.directory /__w/rocksdb/rocksdb tools/check_format_compatible.sh - uses: "./.github/actions/post-steps" - build-linux-run-microbench: + build-linux-non-shm: if: ${{ github.repository_owner == 'facebook' }} runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb + env: + TEST_TMPDIR: "/tmp/rocksdb_test_tmp" steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" - - run: DEBUG_LEVEL=0 make -j32 run_microbench + - run: make V=1 -j32 check - uses: "./.github/actions/post-steps" - build-linux-non-shm: + build-linux-clang-18-asan-ubsan-with-folly: if: ${{ github.repository_owner == 'facebook' }} runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:24.0 options: --shm-size=16gb env: - TEST_TMPDIR: "/tmp/rocksdb_test_tmp" + CC: clang-18 + CXX: clang++-18 steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" - - run: make V=1 -j32 check + - uses: "./.github/actions/cache-getdeps-downloads" + - uses: "./.github/actions/setup-folly" + - uses: "./.github/actions/build-folly" + - run: LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check - uses: "./.github/actions/post-steps" - build-linux-clang-13-asan-ubsan-with-folly: + build-linux-cmake-with-folly: if: ${{ github.repository_owner == 'facebook' }} runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb - env: - CC: clang-13 - CXX: clang++-13 steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" + - uses: "./.github/actions/cache-getdeps-downloads" - uses: "./.github/actions/setup-folly" + - uses: "./.github/actions/cache-folly" + id: cache-folly - uses: "./.github/actions/build-folly" - - run: LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check + with: + cache-hit: ${{ steps.cache-folly.outputs.cache-hit }} + - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make VERBOSE=1 -j20 && ctest -j20)" - uses: "./.github/actions/post-steps" - build-linux-valgrind: + build-linux-release-with-folly: if: ${{ github.repository_owner == 'facebook' }} runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" - - run: make V=1 -j32 valgrind_test + - uses: "./.github/actions/cache-getdeps-downloads" + - uses: "./.github/actions/setup-folly" + - run: "DEBUG_LEVEL=0 make -j20 build_folly" + - run: "USE_FOLLY=1 LIB_MODE=static DEBUG_LEVEL=0 V=1 make -j20 release" + - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 -DCMAKE_BUILD_TYPE=Release .. && make VERBOSE=1 -j20 && ctest -j20)" - uses: "./.github/actions/post-steps" build-windows-vs2022-avx2: if: ${{ github.repository_owner == 'facebook' }} @@ -91,15 +103,6 @@ jobs: steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/windows-build-steps" - build-windows-vs2022: - if: ${{ github.repository_owner == 'facebook' }} - runs-on: windows-2022 - env: - CMAKE_GENERATOR: Visual Studio 17 2022 - CMAKE_PORTABLE: 1 - steps: - - uses: actions/checkout@v4.1.0 - - uses: "./.github/actions/windows-build-steps" build-linux-arm-test-full: if: ${{ github.repository_owner == 'facebook' }} runs-on: @@ -110,3 +113,59 @@ jobs: - run: sudo apt-get update && sudo apt-get install -y build-essential libgflags-dev - run: make V=1 J=4 -j4 check - uses: "./.github/actions/post-steps" + build-linux-arm-crashtest: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 4-core-ubuntu-arm + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: sudo apt-get update && sudo apt-get install -y build-essential libgflags-dev libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev + - run: sudo mount -o remount,size=16G /dev/shm + - run: sudo dd bs=1048576 count=4096 if=/dev/zero of=/swapfile && sudo chmod 600 /swapfile && sudo mkswap /swapfile && sudo swapon /swapfile + - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=1800 --max_key=2500000' blackbox_crash_test_with_atomic_flush + - run: rm -rf /dev/shm/rocksdb.* + - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=1800 --max_key=2500000' blackbox_crash_test_with_multiops_wc_txn + - uses: "./.github/actions/post-steps" + build-examples: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 4-core-ubuntu + container: + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - name: Build examples + run: make V=1 -j4 static_lib && cd examples && make V=1 -j4 + - uses: "./.github/actions/post-steps" + build-fuzzers: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 4-core-ubuntu + container: + image: ghcr.io/facebook/rocksdb_ubuntu:24.0 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - name: Build rocksdb lib + run: CC=clang-18 CXX=clang++-18 USE_CLANG=1 make -j4 static_lib + - name: Build fuzzers + run: cd fuzz && make sst_file_writer_fuzzer db_fuzzer db_map_fuzzer + - uses: "./.github/actions/post-steps" + build-linux-cmake-with-folly-lite: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + container: + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - uses: "./.github/actions/cache-getdeps-downloads" + - uses: "./.github/actions/setup-folly" + - run: "(mkdir build && cd build && cmake -DUSE_FOLLY_LITE=1 -DWITH_GFLAGS=1 -DCMAKE_CXX_FLAGS=-DGLOG_USE_GLOG_EXPORT .. && make VERBOSE=1 -j20 && ctest -j20)" + - uses: "./.github/actions/post-steps" diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml index 7faaff6637a7..0b5ea4b81d23 100644 --- a/.github/workflows/pr-jobs.yml +++ b/.github/workflows/pr-jobs.yml @@ -1,7 +1,18 @@ name: facebook/rocksdb/pr-jobs on: [push, pull_request] permissions: {} +env: + # Set to a job name to run only that job (on any repo), or leave empty for + # normal behavior (all jobs on facebook repo only). + ONLY_JOB: '' jobs: + config: + runs-on: ubuntu-latest + outputs: + only_job: ${{ steps.set.outputs.only_job }} + steps: + - id: set + run: echo "only_job=$ONLY_JOB" >> "$GITHUB_OUTPUT" # NOTE: multiple workflows would be recommended, but the current GHA UI in # PRs doesn't make it clear when there's an overall error with a workflow, # making it easy to overlook something broken. Grouping everything into one @@ -19,6 +30,10 @@ jobs: # increasing the risk of misconfiguration, especially on forks that might # want to run with this GHA setup. # + # SELECTIVE JOB EXECUTION: Set the ONLY_JOB env var at the top of this file + # to a job name (e.g. "build-linux-clang-tidy") to run only that job, + # bypassing the repository owner check. Leave it empty for normal behavior. + # # DEBUGGING WITH SSH: Temporarily add this as a job step, either before the # step of interest without the "if:" line or after the failing step with the # "if:" line. Then use ssh command printed in CI output. @@ -30,7 +45,8 @@ jobs: # ======================== Fast Initial Checks ====================== # check-format-and-targets: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'check-format-and-targets' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4.1.0 @@ -44,6 +60,10 @@ jobs: run: python -m pip install --upgrade pip - name: Install argparse run: pip install argparse + - name: Install clang-format + run: | + pip install https://files.pythonhosted.org/packages/fb/ac/3c04772acc0257f5730e83adb542b2603c1a62d1315010ab593a980af404/clang_format-21.1.2-py2.py3-none-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + clang-format --version - name: Download clang-format-diff.py run: wget https://rocksdb-deps.s3.us-west-2.amazonaws.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py - name: Check format @@ -52,13 +72,22 @@ jobs: run: make check-buck-targets - name: Simple source code checks run: make check-sources + - name: Sanity check check_format_compatible.sh + run: |- + export TEST_TMPDIR=/dev/shm/rocksdb + rm -rf /dev/shm/rocksdb + mkdir /dev/shm/rocksdb + git reset --hard + git config --global --add safe.directory /__w/rocksdb/rocksdb + SANITY_CHECK=1 LONG_TEST=1 tools/check_format_compatible.sh # ========================= Linux With Tests ======================== # build-linux: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'build-linux' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 @@ -66,11 +95,12 @@ jobs: - run: make V=1 J=32 -j32 check - uses: "./.github/actions/post-steps" build-linux-cmake-mingw: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'build-linux-cmake-mingw' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 4-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:24.0 options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 @@ -84,255 +114,175 @@ jobs: which javac && javac -version mkdir build && cd build && cmake -DJNI=1 -DWITH_GFLAGS=OFF .. -DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni - uses: "./.github/actions/post-steps" - build-linux-cmake-with-folly: - if: ${{ github.repository_owner == 'facebook' }} - runs-on: - labels: 16-core-ubuntu - container: - image: zjay437/rocksdb:0.6 - options: --shm-size=16gb - env: - CC: gcc-10 - CXX: g++-10 - steps: - - uses: actions/checkout@v4.1.0 - - uses: "./.github/actions/pre-steps" - - uses: "./.github/actions/setup-folly" - - uses: "./.github/actions/build-folly" - - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)" - - uses: "./.github/actions/post-steps" - build-linux-cmake-with-folly-lite-no-test: - if: ${{ github.repository_owner == 'facebook' }} - runs-on: - labels: 16-core-ubuntu - container: - image: zjay437/rocksdb:0.6 - options: --shm-size=16gb - env: - CC: gcc-10 - CXX: g++-10 - steps: - - uses: actions/checkout@v4.1.0 - - uses: "./.github/actions/pre-steps" - - uses: "./.github/actions/setup-folly" - - run: "(mkdir build && cd build && cmake -DUSE_FOLLY_LITE=1 -DWITH_GFLAGS=1 .. && make V=1 -j20)" - - uses: "./.github/actions/post-steps" build-linux-make-with-folly: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'build-linux-make-with-folly' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb - env: - CC: gcc-10 - CXX: g++-10 steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" + - uses: "./.github/actions/cache-getdeps-downloads" - uses: "./.github/actions/setup-folly" + - uses: "./.github/actions/cache-folly" + id: cache-folly - uses: "./.github/actions/build-folly" + with: + cache-hit: ${{ steps.cache-folly.outputs.cache-hit }} - run: USE_FOLLY=1 LIB_MODE=static V=1 make -j32 check - uses: "./.github/actions/post-steps" build-linux-make-with-folly-lite-no-test: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'build-linux-make-with-folly-lite-no-test' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb - env: - CC: gcc-10 - CXX: g++-10 steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" + - uses: "./.github/actions/cache-getdeps-downloads" - uses: "./.github/actions/setup-folly" - - run: USE_FOLLY_LITE=1 V=1 make -j32 all + - run: USE_FOLLY_LITE=1 EXTRA_CXXFLAGS=-DGLOG_USE_GLOG_EXPORT V=1 make -j32 all - uses: "./.github/actions/post-steps" build-linux-cmake-with-folly-coroutines: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'build-linux-cmake-with-folly-coroutines' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb - env: - CC: gcc-10 - CXX: g++-10 steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" + - uses: "./.github/actions/cache-getdeps-downloads" - uses: "./.github/actions/setup-folly" + - uses: "./.github/actions/cache-folly" + id: cache-folly - uses: "./.github/actions/build-folly" - - run: "(mkdir build && cd build && cmake -DUSE_COROUTINES=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)" + with: + cache-hit: ${{ steps.cache-folly.outputs.cache-hit }} + - run: "(mkdir build && cd build && cmake -DUSE_COROUTINES=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make VERBOSE=1 -j20 && ctest -j20)" - uses: "./.github/actions/post-steps" - build-linux-cmake-with-benchmark: - if: ${{ github.repository_owner == 'facebook' }} + build-linux-cmake-with-benchmark-no-thread-status: + if: needs.config.outputs.only_job == 'build-linux-cmake-with-benchmark-no-thread-status' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" - - run: mkdir build && cd build && cmake -DWITH_GFLAGS=1 -DWITH_BENCHMARK=1 .. && make V=1 -j20 && ctest -j20 + - run: mkdir build && cd build && cmake -DWITH_GFLAGS=1 -DWITH_BENCHMARK=1 -DCMAKE_CXX_FLAGS=-DNROCKSDB_THREAD_STATUS .. && make VERBOSE=1 -j20 && ctest -j20 - uses: "./.github/actions/post-steps" build-linux-encrypted_env-no_compression: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'build-linux-encrypted_env-no_compression' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" - run: ENCRYPTED_ENV=1 ROCKSDB_DISABLE_SNAPPY=1 ROCKSDB_DISABLE_ZLIB=1 ROCKSDB_DISABLE_BZIP=1 ROCKSDB_DISABLE_LZ4=1 ROCKSDB_DISABLE_ZSTD=1 make V=1 J=32 -j32 check - - run: "./sst_dump --help | grep -E -q 'Supported compression types: kNoCompression$' # Verify no compiled in compression\n" + - run: "./sst_dump --help | grep -E -q 'Supported built-in compression types: kNoCompression$' # Verify no compiled in compression\n" - uses: "./.github/actions/post-steps" # ======================== Linux No Test Runs ======================= # build-linux-release: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'build-linux-release' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 - run: make V=1 -j32 LIB_MODE=shared release - run: ls librocksdb.so - - run: "./db_stress --version" + - run: "./trace_analyzer --version" # A tool dependent on gflags that can run in release build - run: make clean - - run: make V=1 -j32 release + - run: USE_RTTI=1 make V=1 -j32 release - run: ls librocksdb.a - - run: "./db_stress --version" + - run: "./trace_analyzer --version" - run: make clean - run: apt-get remove -y libgflags-dev - run: make V=1 -j32 LIB_MODE=shared release - run: ls librocksdb.so - - run: if ./db_stress --version; then false; else true; fi + - run: if ./trace_analyzer --version; then false; else true; fi - run: make clean - - run: make V=1 -j32 release + - run: USE_RTTI=1 make V=1 -j32 release - run: ls librocksdb.a - - run: if ./db_stress --version; then false; else true; fi - - uses: "./.github/actions/post-steps" - build-linux-release-rtti: - if: ${{ github.repository_owner == 'facebook' }} - runs-on: - labels: 8-core-ubuntu - container: - image: zjay437/rocksdb:0.6 - options: --shm-size=16gb - steps: - - uses: actions/checkout@v4.1.0 - - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench - - run: "./db_stress --version" - - run: make clean - - run: apt-get remove -y libgflags-dev - - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench - - run: if ./db_stress --version; then false; else true; fi - build-examples: - if: ${{ github.repository_owner == 'facebook' }} - runs-on: - labels: 4-core-ubuntu - container: - image: zjay437/rocksdb:0.6 - options: --shm-size=16gb - steps: - - uses: actions/checkout@v4.1.0 - - uses: "./.github/actions/pre-steps" - - name: Build examples - run: make V=1 -j4 static_lib && cd examples && make V=1 -j4 - - uses: "./.github/actions/post-steps" - build-fuzzers: - if: ${{ github.repository_owner == 'facebook' }} - runs-on: - labels: 4-core-ubuntu - container: - image: zjay437/rocksdb:0.6 - options: --shm-size=16gb - steps: - - uses: actions/checkout@v4.1.0 - - uses: "./.github/actions/pre-steps" - - name: Build rocksdb lib - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j4 static_lib - - name: Build fuzzers - run: cd fuzz && make sst_file_writer_fuzzer db_fuzzer db_map_fuzzer - - uses: "./.github/actions/post-steps" - build-linux-clang-no_test_run: - if: ${{ github.repository_owner == 'facebook' }} - runs-on: - labels: 8-core-ubuntu - container: - image: zjay437/rocksdb:0.6 - options: --shm-size=16gb - steps: - - uses: actions/checkout@v4.1.0 - - run: CC=clang CXX=clang++ USE_CLANG=1 PORTABLE=1 make V=1 -j16 all + - run: if ./trace_analyzer --version; then false; else true; fi - uses: "./.github/actions/post-steps" build-linux-clang-13-no_test_run: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'build-linux-clang-13-no_test_run' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: - labels: 16-core-ubuntu - container: - image: zjay437/rocksdb:0.6 - options: --shm-size=16gb - steps: - - uses: actions/checkout@v4.1.0 - - uses: "./.github/actions/pre-steps" - - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j32 all microbench - - uses: "./.github/actions/post-steps" - build-linux-gcc-8-no_test_run: - if: ${{ github.repository_owner == 'facebook' }} - runs-on: - labels: 16-core-ubuntu + labels: 8-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" - - run: CC=gcc-8 CXX=g++-8 V=1 make -j32 all + # FIXME: get back to "all microbench" targets + - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 EXTRA_CXXFLAGS=-stdlib=libc++ EXTRA_LDFLAGS=-stdlib=libc++ make -j32 shared_lib + - run: make clean + # FIXME: get back to "release" target + - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 EXTRA_CXXFLAGS=-stdlib=libc++ EXTRA_LDFLAGS=-stdlib=libc++ DEBUG_LEVEL=0 make -j32 shared_lib - uses: "./.github/actions/post-steps" - build-linux-gcc-10-cxx20-no_test_run: - if: ${{ github.repository_owner == 'facebook' }} + build-linux-clang-18-no_test_run: + if: needs.config.outputs.only_job == 'build-linux-clang-18-no_test_run' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:24.0 options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" - - run: CC=gcc-10 CXX=g++-10 V=1 ROCKSDB_CXX_STANDARD=c++20 make -j32 all + - run: CC=clang-18 CXX=clang++-18 USE_CLANG=1 make -j32 all microbench + - run: make clean + - run: CC=clang-18 CXX=clang++-18 USE_CLANG=1 DEBUG_LEVEL=0 make -j32 release - uses: "./.github/actions/post-steps" - build-linux-gcc-11-no_test_run: - if: ${{ github.repository_owner == 'facebook' }} + build-linux-gcc-14-no_test_run: + if: needs.config.outputs.only_job == 'build-linux-gcc-14-no_test_run' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:24.0 options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" - - run: LIB_MODE=static CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench + - run: CC=gcc-14 CXX=g++-14 V=1 make -j32 all microbench - uses: "./.github/actions/post-steps" + # ======================== Linux Other Checks ======================= # - build-linux-clang10-clang-analyze: - if: ${{ github.repository_owner == 'facebook' }} + build-linux-clang18-clang-analyze: + if: needs.config.outputs.only_job == 'build-linux-clang18-clang-analyze' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:24.0 options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" - - run: CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 CLANG_ANALYZER="/usr/bin/clang++-10" CLANG_SCAN_BUILD=scan-build-10 USE_CLANG=1 make V=1 -j32 analyze + - run: CC=clang-18 CXX=clang++-18 ROCKSDB_DISABLE_ALIGNED_NEW=1 CLANG_ANALYZER="/usr/bin/clang++-18" CLANG_SCAN_BUILD=scan-build-18 USE_CLANG=1 make V=1 -j32 analyze - uses: "./.github/actions/post-steps" - name: compress test report run: tar -cvzf scan_build_report.tar.gz scan_build_report @@ -341,8 +291,10 @@ jobs: with: name: scan-build-report path: scan_build_report.tar.gz + build-linux-unity-and-headers: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'build-linux-unity-and-headers' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 4-core-ubuntu container: @@ -356,11 +308,12 @@ jobs: - run: make V=1 -j8 -k check-headers - uses: "./.github/actions/post-steps" build-linux-mini-crashtest: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'build-linux-mini-crashtest' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 4-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 @@ -368,119 +321,122 @@ jobs: - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000' blackbox_crash_test_with_atomic_flush - uses: "./.github/actions/post-steps" # ======================= Linux with Sanitizers ===================== # - build-linux-clang10-asan: - if: ${{ github.repository_owner == 'facebook' }} + build-linux-clang18-asan-ubsan: + if: needs.config.outputs.only_job == 'build-linux-clang18-asan-ubsan' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 32-core-ubuntu container: - image: zjay437/rocksdb:0.6 - options: --shm-size=16gb - steps: - - uses: actions/checkout@v4.1.0 - - uses: "./.github/actions/pre-steps" - - run: COMPILE_WITH_ASAN=1 CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check - - uses: "./.github/actions/post-steps" - build-linux-clang10-ubsan: - if: ${{ github.repository_owner == 'facebook' }} - runs-on: - labels: 16-core-ubuntu - container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:24.0 options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" - - run: COMPILE_WITH_UBSAN=1 CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 ubsan_check + - run: COMPILE_WITH_ASAN=1 COMPILE_WITH_UBSAN=1 CC=clang-18 CXX=clang++-18 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j40 check - uses: "./.github/actions/post-steps" - build-linux-clang13-mini-tsan: - if: ${{ github.repository_owner == 'facebook' }} + build-linux-clang18-mini-tsan: + if: needs.config.outputs.only_job == 'build-linux-clang18-mini-tsan' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 32-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:24.0 options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" - - run: COMPILE_WITH_TSAN=1 CC=clang-13 CXX=clang++-13 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check + - run: COMPILE_WITH_TSAN=1 CC=clang-18 CXX=clang++-18 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check - uses: "./.github/actions/post-steps" build-linux-static_lib-alt_namespace-status_checked: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'build-linux-static_lib-alt_namespace-status_checked' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 16-core-ubuntu container: - image: zjay437/rocksdb:0.6 + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/pre-steps" - - run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=static OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j24 check + - run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=static OPT="-DROCKSDB_USE_STD_SEMAPHORES -DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j24 check - uses: "./.github/actions/post-steps" # ========================= MacOS build only ======================== # build-macos: - if: ${{ github.repository_owner == 'facebook' }} - runs-on: macos-13 + if: needs.config.outputs.only_job == 'build-macos' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config + runs-on: macos-15-xlarge env: ROCKSDB_DISABLE_JEMALLOC: 1 steps: - uses: actions/checkout@v4.1.0 - uses: maxim-lobanov/setup-xcode@v1.6.0 with: - xcode-version: 14.3.1 + xcode-version: 16.4.0 - uses: "./.github/actions/increase-max-open-files-on-macos" - uses: "./.github/actions/install-gflags-on-macos" - uses: "./.github/actions/pre-steps-macos" - name: Build - run: ulimit -S -n `ulimit -H -n` && make V=1 J=16 -j16 all + run: ulimit -S -n `ulimit -H -n` && make V=1 J=16 -j8 all - uses: "./.github/actions/post-steps" # ========================= MacOS with Tests ======================== # build-macos-cmake: - if: ${{ github.repository_owner == 'facebook' }} - runs-on: macos-13 + if: needs.config.outputs.only_job == 'build-macos-cmake' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config + runs-on: macos-15-xlarge strategy: matrix: - run_even_tests: [true, false] + run_sharded_tests: [0, 1, 2, 3] steps: - uses: actions/checkout@v4.1.0 - uses: maxim-lobanov/setup-xcode@v1.6.0 with: - xcode-version: 14.3.1 + xcode-version: 16.4.0 - uses: "./.github/actions/increase-max-open-files-on-macos" - uses: "./.github/actions/install-gflags-on-macos" - uses: "./.github/actions/pre-steps-macos" - name: cmake generate project file run: ulimit -S -n `ulimit -H -n` && mkdir build && cd build && cmake -DWITH_GFLAGS=1 .. - name: Build tests - run: cd build && make V=1 -j16 - - name: Run even tests - run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j16 -I 0,,2 - if: ${{ matrix.run_even_tests }} - - name: Run odd tests - run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j16 -I 1,,2 - if: ${{ ! matrix.run_even_tests }} + run: cd build && make VERBOSE=1 -j8 + - name: Run shard 0 out of 4 test shards + run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j8 -I 0,,4 + if: ${{ matrix.run_sharded_tests == 0 }} + - name: Run shard 1 out of 4 test shards + run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j8 -I 1,,4 + if: ${{ matrix.run_sharded_tests == 1 }} + - name: Run shard 2 out of 4 test shards + run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j8 -I 2,,4 + if: ${{ matrix.run_sharded_tests == 2 }} + - name: Run shard 3 out of 4 test shards + run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j8 -I 3,,4 + if: ${{ matrix.run_sharded_tests == 3 }} - uses: "./.github/actions/post-steps" # ======================== Windows with Tests ======================= # # NOTE: some windows jobs are in "nightly" to save resources - build-windows-vs2019: - if: ${{ github.repository_owner == 'facebook' }} - runs-on: windows-2019 + build-windows-vs2022: + if: needs.config.outputs.only_job == 'build-windows-vs2022' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config + runs-on: windows-8-core env: - CMAKE_GENERATOR: Visual Studio 16 2019 + CMAKE_GENERATOR: Visual Studio 17 2022 CMAKE_PORTABLE: 1 steps: - uses: actions/checkout@v4.1.0 - uses: "./.github/actions/windows-build-steps" # ============================ Java Jobs ============================ # build-linux-java: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'build-linux-java' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 4-core-ubuntu container: - image: evolvedbinary/rocksjava:centos6_x64-be + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb steps: # The docker image is intentionally based on an OS that has an older GLIBC version. # That GLIBC is incompatibile with GitHub's actions/checkout. Thus we implement a manual checkout step. + # NOTE: replaced evolvedbinary/rocksjava:centos7_x64-be with ghcr.io/facebook/rocksdb_ubuntu:22.1 + # until a more appropriate docker image with C++20 support is made. - name: Checkout env: GH_TOKEN: ${{ github.token }} @@ -497,18 +453,22 @@ jobs: which java && java -version which javac && javac -version - name: Test RocksDBJava - run: scl enable devtoolset-7 'make V=1 J=8 -j8 jtest' - # NOTE: post-steps skipped because of compatibility issues with docker image + # NOTE: replaced scl enable devtoolset-7 'make V=1 J=8 -j8 jtest' + run: make V=1 J=8 -j8 jtest + # post-steps skipped because of compatibility issues with docker image build-linux-java-static: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'build-linux-java-static' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 4-core-ubuntu container: - image: evolvedbinary/rocksjava:centos6_x64-be + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 options: --shm-size=16gb steps: # The docker image is intentionally based on an OS that has an older GLIBC version. # That GLIBC is incompatibile with GitHub's actions/checkout. Thus we implement a manual checkout step. + # NOTE: replaced evolvedbinary/rocksjava:centos7_x64-be with ghcr.io/facebook/rocksdb_ubuntu:22.1 + # until a more appropriate docker image with C++20 support is made. - name: Checkout env: GH_TOKEN: ${{ github.token }} @@ -525,11 +485,13 @@ jobs: which java && java -version which javac && javac -version - name: Build RocksDBJava Static Library - run: scl enable devtoolset-7 'make V=1 J=8 -j8 rocksdbjavastatic' - # NOTE: post-steps skipped because of compatibility issues with docker image + # NOTE: replaced scl enable devtoolset-7 'make V=1 J=8 -j8 rocksdbjavastatic' + run: make V=1 J=8 -j8 rocksdbjavastatic + # post-steps skipped because of compatibility issues with docker image build-macos-java: - if: ${{ github.repository_owner == 'facebook' }} - runs-on: macos-13 + if: needs.config.outputs.only_job == 'build-macos-java' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config + runs-on: macos-15-xlarge env: JAVA_HOME: "/Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home" ROCKSDB_DISABLE_JEMALLOC: 1 @@ -537,7 +499,7 @@ jobs: - uses: actions/checkout@v4.1.0 - uses: maxim-lobanov/setup-xcode@v1.6.0 with: - xcode-version: 14.3.1 + xcode-version: 16.4.0 - uses: "./.github/actions/increase-max-open-files-on-macos" - uses: "./.github/actions/install-gflags-on-macos" - uses: "./.github/actions/install-jdk8-on-macos" @@ -551,15 +513,16 @@ jobs: run: make V=1 J=16 -j16 jtest - uses: "./.github/actions/post-steps" build-macos-java-static: - if: ${{ github.repository_owner == 'facebook' }} - runs-on: macos-13 + if: needs.config.outputs.only_job == 'build-macos-java-static' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config + runs-on: macos-15-xlarge env: JAVA_HOME: "/Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home" steps: - uses: actions/checkout@v4.1.0 - uses: maxim-lobanov/setup-xcode@v1.6.0 with: - xcode-version: 14.3.1 + xcode-version: 16.4.0 - uses: "./.github/actions/increase-max-open-files-on-macos" - uses: "./.github/actions/install-gflags-on-macos" - uses: "./.github/actions/install-jdk8-on-macos" @@ -573,15 +536,16 @@ jobs: run: make V=1 J=16 -j16 rocksdbjavastaticosx - uses: "./.github/actions/post-steps" build-macos-java-static-universal: - if: ${{ github.repository_owner == 'facebook' }} - runs-on: macos-13 + if: needs.config.outputs.only_job == 'build-macos-java-static-universal' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config + runs-on: macos-15-xlarge env: JAVA_HOME: "/Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home" steps: - uses: actions/checkout@v4.1.0 - uses: maxim-lobanov/setup-xcode@v1.6.0 with: - xcode-version: 14.3.1 + xcode-version: 16.4.0 - uses: "./.github/actions/increase-max-open-files-on-macos" - uses: "./.github/actions/install-gflags-on-macos" - uses: "./.github/actions/install-jdk8-on-macos" @@ -595,11 +559,12 @@ jobs: run: make V=1 J=16 -j16 rocksdbjavastaticosx_ub - uses: "./.github/actions/post-steps" build-linux-java-pmd: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'build-linux-java-pmd' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 4-core-ubuntu container: - image: evolvedbinary/rocksjava:rockylinux8_x64-be + image: evolvedbinary/rocksjava:alpine3_x64-be options: --shm-size=16gb steps: - uses: actions/checkout@v4.1.0 @@ -621,7 +586,8 @@ jobs: name: maven-site path: "${{ github.workspace }}/java/target/site" build-linux-arm: - if: ${{ github.repository_owner == 'facebook' }} + if: needs.config.outputs.only_job == 'build-linux-arm' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook') + needs: config runs-on: labels: 4-core-ubuntu-arm steps: diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml new file mode 100644 index 000000000000..37d36513a783 --- /dev/null +++ b/.github/workflows/weekly.yml @@ -0,0 +1,20 @@ +name: facebook/rocksdb/weekly +on: + schedule: + - cron: 0 9 * * 0 + workflow_dispatch: +permissions: {} +jobs: + build-linux-valgrind: + if: ${{ github.repository_owner == 'facebook' }} + runs-on: + labels: 16-core-ubuntu + timeout-minutes: 840 + container: + image: ghcr.io/facebook/rocksdb_ubuntu:22.1 + options: --shm-size=16gb + steps: + - uses: actions/checkout@v4.1.0 + - uses: "./.github/actions/pre-steps" + - run: make V=1 -j20 valgrind_test + - uses: "./.github/actions/post-steps" diff --git a/BUCK b/BUCK index bffed60e4add..c05b7bb33d3a 100644 --- a/BUCK +++ b/BUCK @@ -1,12 +1,14 @@ # This file @generated by: #$ python3 buckifier/buckify_rocksdb.py # --> DO NOT EDIT MANUALLY <-- -# This file is a Facebook-specific integration for buck builds, so can -# only be validated by Facebook employees. +# This file is a Meta-specific integration for buck builds, so can +# only be validated by Meta employees. load("//rocks/buckifier:defs.bzl", "cpp_library_wrapper","rocks_cpp_library_wrapper","cpp_binary_wrapper","cpp_unittest_wrapper","fancy_bench_wrapper","add_c_test_wrapper") load("@fbcode_macros//build_defs:export_files.bzl", "export_file") +oncall("rocksdb_point_of_contact") + cpp_library_wrapper(name="rocksdb_lib", srcs=[ "cache/cache.cc", "cache/cache_entry_roles.cc", @@ -88,6 +90,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "db/memtable_list.cc", "db/merge_helper.cc", "db/merge_operator.cc", + "db/multi_scan.cc", "db/output_validator.cc", "db/periodic_task_scheduler.cc", "db/range_del_aggregator.cc", @@ -113,6 +116,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "db/write_controller.cc", "db/write_stall_stats.cc", "db/write_thread.cc", + "db_stress_tool/db_stress_compression_manager.cc", "env/composite_env.cc", "env/env.cc", "env/env_chroot.cc", @@ -214,7 +218,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "table/cuckoo/cuckoo_table_builder.cc", "table/cuckoo/cuckoo_table_factory.cc", "table/cuckoo/cuckoo_table_reader.cc", - "table/external_table_reader.cc", + "table/external_table.cc", "table/format.cc", "table/get_context.cc", "table/iterator.cc", @@ -249,6 +253,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "trace_replay/trace_record_result.cc", "trace_replay/trace_replay.cc", "util/async_file_reader.cc", + "util/auto_tune_compressor.cc", "util/build_version.cc", "util/cleanable.cc", "util/coding.cc", @@ -263,10 +268,12 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "util/dynamic_bloom.cc", "util/file_checksum_helper.cc", "util/hash.cc", + "util/io_dispatcher_imp.cc", "util/murmurhash.cc", "util/random.cc", "util/rate_limiter.cc", "util/ribbon_config.cc", + "util/simple_mixed_compressor.cc", "util/slice.cc", "util/status.cc", "util/stderr_logger.cc", @@ -415,16 +422,19 @@ cpp_library_wrapper(name="rocksdb_tools_lib", srcs=[ cpp_library_wrapper(name="rocksdb_cache_bench_tools_lib", srcs=["cache/cache_bench_tool.cc"], deps=[":rocksdb_lib"], headers=[], link_whole=False, extra_test_libs=False) +cpp_library_wrapper(name="rocksdb_point_lock_bench_tools_lib", srcs=["utilities/transactions/lock/point/point_lock_bench_tool.cc"], deps=[":rocksdb_lib"], headers=[], link_whole=False, extra_test_libs=False) + rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[ "db_stress_tool/batched_ops_stress.cc", "db_stress_tool/cf_consistency_stress.cc", "db_stress_tool/db_stress_common.cc", + "db_stress_tool/db_stress_compaction_service.cc", + "db_stress_tool/db_stress_compression_manager.cc", "db_stress_tool/db_stress_driver.cc", "db_stress_tool/db_stress_filters.cc", "db_stress_tool/db_stress_gflags.cc", "db_stress_tool/db_stress_listener.cc", "db_stress_tool/db_stress_shared_state.cc", - "db_stress_tool/db_stress_stat.cc", "db_stress_tool/db_stress_test_base.cc", "db_stress_tool/db_stress_tool.cc", "db_stress_tool/db_stress_wide_merge_operator.cc", @@ -446,6 +456,8 @@ cpp_binary_wrapper(name="db_bench", srcs=["tools/db_bench.cc"], deps=[":rocksdb_ cpp_binary_wrapper(name="cache_bench", srcs=["cache/cache_bench.cc"], deps=[":rocksdb_cache_bench_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False) +cpp_binary_wrapper(name="point_lock_bench", srcs=["utilities/transactions/lock/point/point_lock_bench.cc"], deps=[":rocksdb_point_lock_bench_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False) + cpp_binary_wrapper(name="ribbon_bench", srcs=["microbench/ribbon_bench.cc"], deps=[], extra_preprocessor_flags=[], extra_bench_libs=True) cpp_binary_wrapper(name="db_basic_bench", srcs=["microbench/db_basic_bench.cc"], deps=[], extra_preprocessor_flags=[], extra_bench_libs=True) @@ -4709,6 +4721,12 @@ cpp_unittest_wrapper(name="compressed_secondary_cache_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="compression_test", + srcs=["util/compression_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="configurable_test", srcs=["options/configurable_test.cc"], deps=[":rocksdb_test_lib"], @@ -4805,6 +4823,12 @@ cpp_unittest_wrapper(name="db_clip_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="db_compaction_abort_test", + srcs=["db/db_compaction_abort_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="db_compaction_filter_test", srcs=["db/db_compaction_filter_test.cc"], deps=[":rocksdb_test_lib"], @@ -4829,6 +4853,12 @@ cpp_unittest_wrapper(name="db_encryption_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="db_etc3_test", + srcs=["db/db_etc3_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="db_flush_test", srcs=["db/db_flush_test.cc"], deps=[":rocksdb_test_lib"], @@ -5185,6 +5215,18 @@ cpp_unittest_wrapper(name="inlineskiplist_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="interval_test", + srcs=["util/interval_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + +cpp_unittest_wrapper(name="io_dispatcher_test", + srcs=["util/io_dispatcher_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="io_posix_test", srcs=["env/io_posix_test.cc"], deps=[":rocksdb_test_lib"], @@ -5365,6 +5407,12 @@ cpp_unittest_wrapper(name="plain_table_db_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="point_lock_manager_stress_test", + srcs=["utilities/transactions/lock/point/point_lock_manager_stress_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="point_lock_manager_test", srcs=["utilities/transactions/lock/point/point_lock_manager_test.cc"], deps=[":rocksdb_test_lib"], @@ -5683,6 +5731,12 @@ cpp_unittest_wrapper(name="write_prepared_transaction_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="write_prepared_transaction_test_seqno", + srcs=["utilities/transactions/write_prepared_transaction_test_seqno.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="write_unprepared_transaction_test", srcs=["utilities/transactions/write_unprepared_transaction_test.cc"], deps=[":rocksdb_test_lib"], diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000000..39ef7dbc380d --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,312 @@ +# RocksDB Code Generation and Review Guidance + +This document provides guidance for generating and reviewing code in the RocksDB project, derived from analysis of code review feedback across hundreds of complex merged Pull Requests. Use this as a reference when writing code with AI assistants or conducting code reviews. + +--- + +## General Best Practices + +### Code Quality and Maintainability + +**Clarity and Readability:** Write clear, self-documenting code. Use meaningful variable names, add comments for complex logic, and structure code to minimize cognitive load. Avoid clever tricks that sacrifice readability for marginal performance gains unless absolutely necessary. + +**Consistent Style:** Follow existing code style conventions. RocksDB uses `.clang-format` for formatting, specific naming conventions, and structural patterns. Deviations from these patterns are frequently flagged in reviews. + +**Error Handling:** Ensure robust error handling throughout the codebase. Use RocksDB's `Status` type consistently, propagate errors appropriately, and avoid silently ignoring failures. Reviewers pay close attention to edge cases and failure modes. + +### Testing Philosophy + +**Comprehensive Coverage:** Every change should include appropriate test coverage. This includes unit tests for isolated functionality, integration tests for component interactions, and stress tests for concurrency and performance validation. Reviewers will ask for additional tests if coverage is insufficient. + +**Edge Cases and Failure Modes:** Tests should explicitly cover edge cases, boundary conditions, and potential failure scenarios. This is especially important for changes affecting core database operations, compaction, or recovery logic. + +**Platform-Specific Testing:** RocksDB supports multiple platforms (Linux, Windows, macOS) and compilers (GCC, Clang, MSVC). Changes should be tested across relevant platforms, particularly when touching platform-specific code or using compiler-specific features. + +### Performance Considerations + +**⚠️ PERFORMANCE IS CRITICAL:** RocksDB is a high-performance storage engine where every CPU cycle and memory access matters. When writing code, always evaluate from a performance perspective. This is not optional—performance-aware coding is a fundamental requirement for all contributions. + +**Benchmarking and Profiling:** Performance claims should be backed by empirical evidence. Use RocksDB's benchmarking tools (e.g., `db_bench`) to validate improvements. Reviewers will request benchmark results for changes that could impact performance. + +**Memory Allocation:** Minimize dynamic memory allocations, especially in hot paths. Prefer stack allocation over heap allocation. Reuse buffers when possible. Consider using arena allocators or memory pools for frequent small allocations. Every `new`, `malloc`, or container resize has a cost. + +**Memory Copy:** Avoid unnecessary memory copies. Use move semantics, `std::string_view`, `Slice`, and pass-by-reference where appropriate. Be aware of implicit copies in STL containers and function returns. Prefer in-place operations over copy-and-modify patterns. + +**CPU Cache Efficiency:** Design data structures and access patterns to be cache-friendly. Keep frequently accessed data together (data locality). Prefer sequential memory access over random access. Be mindful of cache line sizes (typically 64 bytes) and avoid false sharing in concurrent code. Consider struct packing and field ordering to improve cache utilization. + +**Loop Optimization:** Look for opportunities to collapse nested loops, reduce loop overhead, and minimize branch mispredictions. Hoist invariant computations out of loops. Consider loop unrolling for tight inner loops. Batch operations when possible to amortize per-operation overhead. + +**SIMD and Vectorization:** Leverage SIMD instructions (SSE, AVX) for data-parallel operations when appropriate. Structure data to enable auto-vectorization by the compiler. Consider explicit SIMD intrinsics for critical hot paths like checksum computation, encoding/decoding, and bulk data processing. + +**Branch Prediction:** Minimize unpredictable branches in hot paths. Use `LIKELY`/`UNLIKELY` macros to hint branch prediction. Consider branchless alternatives for simple conditionals. Order switch cases and if-else chains by frequency. + +**Memory and Resource Management:** Be mindful of memory allocations, especially in hot paths. Use RAII patterns, smart pointers, and RocksDB's memory management utilities appropriately. + +**Hot Path Analysis:** When deciding how aggressively to optimize code, consider whether it's on a hot path: +- **Hot path** (executed thousands+ times, e.g., data access, iteration, compaction loops): Performance is paramount. Apply all optimization techniques—loop collapsing, SIMD, cache optimization, pre-allocation, etc. The cost of each operation is multiplied by execution frequency. +- **Cold path** (executed rarely, e.g., DB open, configuration parsing, error handling): Maintainability and clarity are more important. Prefer readable code over micro-optimizations. Complex optimizations here add maintenance burden with negligible performance benefit. +- **Warm path** (moderate frequency): Balance both concerns. Use profiling data to guide optimization decisions. + +**Avoid Premature Optimization:** While performance is critical, focus on correctness first, then optimize based on profiling data. However, be performance-aware from the start—choosing the right algorithm and data structure upfront is not premature optimization. Use the hot path analysis above to decide how much optimization effort is warranted. + +### API Design and Compatibility + +**Backwards Compatibility:** RocksDB maintains strong backwards compatibility guarantees. Breaking changes are rare and require extensive justification. When deprecating features, follow the project's deprecation policy (typically spanning multiple releases). + +**API Consistency:** New APIs should be consistent with existing patterns. Use similar naming conventions, parameter ordering, and return types. Reviewers will suggest changes to improve consistency with the broader codebase. + +**Documentation:** Public APIs must be thoroughly documented. Include usage examples, parameter descriptions, and notes on thread safety, performance characteristics, and compatibility considerations. + +--- + +## Component-Specific Guidance + +### Database Core (`db`) + +The database core handles write-ahead logging (WAL), memtables, compaction, and recovery. This component receives the most scrutiny in code reviews. + +**Concurrency and Thread Safety:** Database operations are highly concurrent. Reviewers carefully examine locking strategies, atomic operations, and memory ordering. Document synchronization assumptions clearly. Use appropriate memory ordering semantics (`acquire`/`release` vs. `seq_cst`). + +**Compaction Logic:** Changes to compaction are complex and high-risk. Ensure that compaction logic respects configured parameters, handles edge cases (empty databases, single-file compactions), and maintains correctness under concurrent operations. + +**Error Propagation:** Database operations can fail in many ways (I/O errors, corruption, resource exhaustion). Ensure that errors are properly propagated, logged, and handled. Avoid assertions in production code paths. + +**Testing:** Database core changes require extensive testing, including unit tests, integration tests, and stress tests. Test with various configurations, compaction styles, and concurrent workloads. + +### Public Headers (`include`) + +Public headers define RocksDB's API surface. Changes here have the highest compatibility impact. + +**API Design:** New APIs should be intuitive, consistent with existing patterns, and well-documented. Consider how the API will be used in practice and avoid adding unnecessary complexity. + +**Backwards Compatibility:** Breaking changes to public APIs require extensive justification and a deprecation plan. Maintain ABI compatibility for bug fixes and patch releases. + +**Documentation:** Every public API must be thoroughly documented with usage examples, parameter descriptions, and notes on thread safety and performance characteristics. + +**Deprecation:** When deprecating APIs, follow the project's policy. Mark deprecated APIs clearly, provide migration guidance, and maintain support for at least one major release. + +### Internal Utilities (`util`) + +Internal utilities provide common functionality used throughout the codebase. + +**Code Reuse:** Utilities should be general-purpose and reusable. Avoid duplicating functionality that already exists elsewhere in the codebase. + +**Error Handling:** Utility functions should handle errors robustly and propagate them appropriately. Consider edge cases like overflow, underflow, and invalid inputs. + +**Testing:** Utility functions should have comprehensive test coverage, including edge cases and failure modes. Consider adding death tests for assertions. + +**Performance:** Utilities are often used in hot paths. Ensure that implementations are efficient and avoid unnecessary allocations or copies. + +### Table Management (`table`) + +Table management handles SST file format, block-based tables, and table readers/writers. + +**Block Format and Checksums:** Changes to block format require extreme care. Ensure that checksums are computed and verified correctly. Test with various compression algorithms and block sizes. + +**Iterator Correctness:** Table iterators are used throughout the codebase. Ensure that iterator semantics (Seek, Next, Prev) are correct, especially at boundaries and with deletions. + +**Caching and Prefetching:** Table readers interact with the block cache and prefetching logic. Ensure that cache keys are unique and that prefetching respects configured limits. + +**Performance:** Table operations are performance-critical. Benchmark changes that could impact read or write performance. + +### Utilities (`utilities`) + +Utilities include optional features like transactions, backup engine, and checkpoint. + +**Feature Isolation:** Utilities should be self-contained and not introduce unnecessary dependencies on core database internals. + +**Deprecation and Cleanup:** Legacy features are being phased out. When removing deprecated code, ensure that migration paths are documented and that users have sufficient warning. + +**Cross-Platform Compatibility:** Utilities often interact with OS-specific APIs. Ensure that code works on all supported platforms. + +### Options and Configuration (`options`) + +Options define RocksDB's configuration system. + +**Type Safety:** Use appropriate types for options (e.g., `uint32_t` for flags, scoped enums for enumerated values). + +**Deprecation Policy:** When deprecating options, follow the project's policy. Document the deprecation, provide migration guidance, and maintain support for at least one major release. + +**Dynamic Configuration:** Some options can be changed dynamically. Ensure that dynamic changes are thread-safe and take effect correctly. + +**Validation:** Validate option values and provide clear error messages for invalid configurations. + +### Cache (`cache`) + +Cache management is critical for RocksDB's performance. + +**Concurrency:** Cache operations are highly concurrent. Ensure that implementations are thread-safe and use appropriate synchronization primitives. + +**Performance:** Cache operations are in the hot path. Optimize for low latency and high throughput. Benchmark changes carefully. + +**Memory Management:** Cache implementations must manage memory carefully to avoid leaks and excessive allocations. + +**Eviction Policies:** Changes to eviction policies should be well-tested and benchmarked to ensure they improve overall performance. + +--- + +## Code Review Checklist + +When reviewing RocksDB code (or preparing code for review), use this checklist: + +### Correctness +- [ ] Does the change preserve database semantics (e.g., snapshot isolation, key ordering)? +- [ ] Are all error cases handled appropriately? +- [ ] Is the change thread-safe? Are synchronization primitives used correctly? +- [ ] Are there any potential data races or deadlocks? + +### Testing +- [ ] Does the change include appropriate test coverage? +- [ ] Are edge cases and failure modes tested? +- [ ] Have the tests been run on all supported platforms? +- [ ] Are stress tests passing? + +### Performance +- [ ] Are there benchmark results for performance-sensitive changes? +- [ ] Does the change avoid unnecessary allocations or copies? +- [ ] Are hot paths optimized appropriately? + +### API and Compatibility +- [ ] Is the change backwards compatible? +- [ ] Are new APIs consistent with existing patterns? +- [ ] Is the public API documented? +- [ ] Are deprecated features handled according to policy? + +### Code Quality +- [ ] Does the code follow RocksDB's style conventions? +- [ ] Is the code clear and maintainable? +- [ ] Are comments and documentation sufficient? +- [ ] Are there any code smells or anti-patterns? + +--- + +## Common Review Feedback Patterns + +The following patterns emerged as frequent sources of review feedback: + +1. **Test Coverage:** Reviewers frequently request additional tests for edge cases, platform-specific behavior, and failure modes. Complex changes require comprehensive test coverage including unit tests, integration tests, and stress tests. + +2. **Error Handling:** Ensure proper error propagation using RocksDB's `Status` type. Avoid silent failures and provide clear error messages that include context about what failed and why. + +3. **API Design:** New APIs should be consistent with existing patterns. Use descriptive names that follow established conventions. Avoid breaking changes without strong justification and a clear deprecation plan. + +4. **Documentation:** Public APIs must be documented with usage examples and notes on thread safety, performance characteristics, and compatibility considerations. Complex internal logic should also be well-commented. + +5. **Performance:** Performance-sensitive changes require benchmark results to validate improvements. Use `db_bench` and other profiling tools to measure impact. Avoid premature optimization that adds complexity without measurable benefit. + +6. **Concurrency:** Thread safety is critical in RocksDB. Document synchronization assumptions clearly. Use appropriate memory ordering semantics. Consider potential race conditions and deadlocks. + +7. **Code Style:** Follow existing conventions for naming, formatting, and structure. Use `.clang-format` for consistent formatting. Prefer scoped enums (`enum class`) over unscoped enums. + +8. **Backwards Compatibility:** RocksDB maintains strong compatibility guarantees. Breaking changes require extensive justification. When deprecating features, provide migration guidance and maintain support across multiple releases. + +9. **Refactoring:** Reviewers appreciate refactoring that improves code readability and maintainability. Look for opportunities to deduplicate code and simplify complex logic. + +10. **Platform Compatibility:** Ensure changes work correctly on all supported platforms (Linux, Windows, macOS) and with all supported compilers (GCC, Clang, MSVC). + +--- + +## Important tips + +### Build system +* There are 3 build system. Make, CMake, BUCK(meta internal). +* When a new .cc file is added, update Makefile, CMakeLists.txt, src.mk, BUCK. +* Don't manually edit BUCK file, after updating src.mk, run + /usr/local/bin/python3 buckifier/buckify_rocksdb.py to update it +* Use make to build and run the test. CMake and BUCK are not used locally. +* Use `make dbg` command to build all of the unit test in debug mode. +* For -j in make command, use the number of CPU cores to decide it. + +### Unit Test +* After all of the unit tests are added, review them and try to extract common + reusable utility functions to reduce code duplication due to copy past between + unit tests. This should be done every time unit test is updated. +* Don't use sleep to wait for certain events to happen. This will cause test to + be flaky. Instead, use sync point to synchronize thread progress. +* Cap unit test execution with 60 seconds timeout. +* When there are multiple unit tests need to be executed, try to use + gtest_parallel.py if available. E.g. + python3 ${GTEST_PARALLEL}/gtest_parallel.py ./table_test + +### Unit test dedup guidelines +* Extract helper functions for repeated patterns such as object + construction, round-trip (encode → decode → verify), and common + assertion sequences. +* Use table-driven tests (struct array + loop) when multiple test cases + share the same logic but differ only in input/expected data. +* Prefer randomized tests over exhaustive parameter permutations. Use + `Random` from `util/random.h` (not `std::mt19937`). Use a time-based + seed with `SCOPED_TRACE("seed=" + std::to_string(seed))` so failures + are reproducible. +* Keep deterministic edge-case tests separate from randomized tests + (error paths, boundary conditions, format verification). +* Methods only used in tests should be private with `friend class` + + `TEST_F` fixture wrappers. In wrappers, always fully qualify the + target method to avoid infinite recursion. + +### Adding new public API + Refer to claude_md/add_public_api.md + +### Adding new option + Refer to claude_md/add_option.md + +### Metrics +* When adding a new feature, evaluate whether there is opportunity to add + metrics. Try to avoid causing performance regression on hot path when adding + metrics. + +### Stress test +* When adding a new feature, make sure stress test covers the new option. + +### DB bench update +* When adding a performance related feature, support it in db_bench + +### Adding release note +* Release note should be kept short at high level for external user consumption. + +### Blog posts (docs/_posts) +* Blog post authors must be defined in `docs/_data/authors.yml` to be displayed + +### Final verification of the change +* Execute make clean to clean all of the changes. +* Execute make check to build all of the changes and execute all of the tests. + Note that executing all of the tests could take multiple minutes. + +### Monitoring make check progress +* Use `make check-progress` to get machine-parseable JSON progress while + `make check` is running. This is useful for Claude Code to monitor long + builds without timeout issues. +* Run `make check` in background, then poll progress: + ```bash + make check & + # Poll periodically: + make check-progress + ``` +* The output shows current phase and progress: + ```json + {"status":"running","phase":"compiling","completed":300,"total":919,...} + {"status":"running","phase":"testing","completed":1500,"total":29962,"failed":0,"percent":5,...} + {"status":"completed","phase":"testing","completed":29962,"total":29962,"failed":0,"percent":100,...} + ``` +* Phases: `compiling` -> `linking` -> `generating` -> `testing` -> `completed` +* Key fields: `status`, `phase`, `completed`, `total`, `failed`, `percent` +* When tests fail, `failed_tests` array shows details (up to 10 failures): + ```json + {"status":"running",...,"failed":3,"failed_tests":[ + {"test":"cache_test-CacheTest.Usage","exit_code":1,"signal":0,"output":"...test log..."}, + {"test":"env_test-EnvTest.Open","exit_code":0,"signal":11,"output":"...Segmentation fault..."} + ]} + ``` +* `exit_code`: non-zero means test assertion failed +* `signal`: non-zero means test was killed (e.g., 9=SIGKILL, 6=SIGABRT, 11=SIGSEGV) +* `output`: last 50 lines of test log including error messages and stack traces + +### Executing benchmark using db_bench +* Since the goal is to measure performance, we need to build a release binary + using `make clean && DEBUG_LEVEL=0 make db_bench`. If there is an engine + crash due to bug, we need to switch back to debug build. Make sure to run + `make clean` before running `make dbg`. + +### Formatting code +* After making change, use `make format-auto` to auto-apply formatting without + interactive prompts (Claude Code friendly). diff --git a/CMakeLists.txt b/CMakeLists.txt index cce07d70fec7..f0e79d9306e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,7 +27,7 @@ # # Linux: # -# 1. Install a recent toolchain if you're on a older distro. C++17 required (GCC >= 7, Clang >= 5) +# 1. Install a recent toolchain if you're on a older distro. C++20 required (GCC >= 11, Clang >= 10) # 2. mkdir build; cd build # 3. cmake .. # 4. make -j @@ -80,6 +80,7 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE STRING "Default BUILD_TYPE is ${default_build_type}" FORCE) endif() +message(STATUS "CMAKE_BUILD_TYPE is set to ${CMAKE_BUILD_TYPE}") find_program(CCACHE_FOUND ccache) if(CCACHE_FOUND) @@ -100,7 +101,7 @@ endif() option(ROCKSDB_BUILD_SHARED "Build shared versions of the RocksDB libraries" ON) if( NOT DEFINED CMAKE_CXX_STANDARD ) - set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD 20) endif() include(CMakeDependentOption) @@ -132,7 +133,9 @@ else() option(WITH_GFLAGS "build with GFlags" ON) endif() set(GFLAGS_LIB) - if(WITH_GFLAGS) + # Skip all gflags detection and setup when USE_FOLLY or USE_COROUTINES is enabled + # since Folly provides its own gflags (USE_COROUTINES automatically sets USE_FOLLY) + if(WITH_GFLAGS AND NOT USE_FOLLY AND NOT USE_COROUTINES) # Config with namespace available since gflags 2.2.2 option(GFLAGS_USE_TARGET_NAMESPACE "Use gflags import target with namespace." ON) find_package(gflags CONFIG) @@ -151,6 +154,9 @@ else() include_directories(${GFLAGS_INCLUDE_DIR}) list(APPEND THIRDPARTY_LIBS ${GFLAGS_LIB}) add_definitions(-DGFLAGS=1) + elseif(WITH_GFLAGS AND (USE_FOLLY OR USE_COROUTINES)) + # Still set the DGFLAGS=1 define when using Folly since Folly provides gflags + add_definitions(-DGFLAGS=1) endif() if(WITH_SNAPPY) @@ -203,9 +209,20 @@ if(WIN32 AND MSVC) endif() endif() +option(WIN_CI "Accelerate build speed and reduce build artifect size for github CI with MSVC" OFF) + if(MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4996 /wd4100 /wd4324") + if(WIN_CI) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /nologo /EHsc /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /W4 /wd4127 /wd4996 /wd4100 /wd4324 /wd4702") + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4996 /wd4100 /wd4324") + endif() + if(CMAKE_BUILD_TYPE STREQUAL "Release") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DNDEBUG") + message(STATUS "Setting /DNDEBUG as CMAKE_BUILD_TYPE is set to ${CMAKE_BUILD_TYPE}") + endif() else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall -pthread") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing -Wno-invalid-offsetof") @@ -313,8 +330,7 @@ if(NOT MSVC) endif() # Check if -latomic is required or not -if (NOT MSVC) - set(CMAKE_REQUIRED_FLAGS "--std=c++17") +if (NOT MSVC AND NOT APPLE) CHECK_CXX_SOURCE_COMPILES(" #include std::atomic x(0); @@ -451,24 +467,33 @@ else() endif() endif() -# Used to run CI build and tests so we can run faster +# Used to run optimized debug build and tests so we can run faster option(OPTDBG "Build optimized debug build with MSVC" OFF) option(WITH_RUNTIME_DEBUG "build with debug version of runtime library" ON) if(MSVC) - if(OPTDBG) + if (WIN_CI) message(STATUS "Debug optimization is enabled") set(CMAKE_CXX_FLAGS_DEBUG "/Oxt") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEBUG:FASTLINK") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG:FASTLINK") else() - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1") - - # Minimal Build is deprecated after MSVC 2015 - if( MSVC_VERSION GREATER 1900 ) - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm-") + if(OPTDBG) + message(STATUS "Debug optimization is enabled") + set(CMAKE_CXX_FLAGS_DEBUG "/Oxt") else() - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm") - endif() + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1") + # Minimal Build is deprecated after MSVC 2015 + if( MSVC_VERSION GREATER 1900 ) + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm-") + else() + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm") + endif() + endif() + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEBUG") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG") endif() + if(WITH_RUNTIME_DEBUG) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /${RUNTIME_LIBRARY}d") else() @@ -476,8 +501,6 @@ if(MSVC) endif() set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oxt /Zp8 /Gm- /Gy /${RUNTIME_LIBRARY}") - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEBUG") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG") endif() if(CMAKE_COMPILER_IS_GNUCXX) @@ -629,6 +652,12 @@ if(USE_FOLLY) ${FOLLY_INST_PATH}/lib/cmake/folly/folly-targets.cmake) include(${FOLLY_INST_PATH}/lib/cmake/folly/folly-config.cmake) + + # Fix gflags library name for debug builds + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath=${GFLAGS_INST_PATH}/lib") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${GFLAGS_INST_PATH}/lib/libgflags_debug.so.2.2") + endif() endif() add_compile_definitions(USE_FOLLY FOLLY_NO_CONFIG HAVE_CXX11_ATOMIC) @@ -721,6 +750,7 @@ set(SOURCES db/memtable_list.cc db/merge_helper.cc db/merge_operator.cc + db/multi_scan.cc db/output_validator.cc db/periodic_task_scheduler.cc db/range_del_aggregator.cc @@ -746,6 +776,7 @@ set(SOURCES db/write_controller.cc db/write_stall_stats.cc db/write_thread.cc + db_stress_tool/db_stress_compression_manager.cc env/composite_env.cc env/env.cc env/env_chroot.cc @@ -835,7 +866,7 @@ set(SOURCES table/cuckoo/cuckoo_table_builder.cc table/cuckoo/cuckoo_table_factory.cc table/cuckoo/cuckoo_table_reader.cc - table/external_table_reader.cc + table/external_table.cc table/format.cc table/get_context.cc table/iterator.cc @@ -874,17 +905,20 @@ set(SOURCES trace_replay/trace_record.cc trace_replay/trace_replay.cc util/async_file_reader.cc + util/auto_tune_compressor.cc util/cleanable.cc util/coding.cc util/compaction_job_stats_impl.cc util/comparator.cc util/compression.cc + util/simple_mixed_compressor.cc util/compression_context_cache.cc util/concurrent_task_limiter_impl.cc util/crc32c.cc util/data_structure.cc util/dynamic_bloom.cc util/hash.cc + util/io_dispatcher_imp.cc util/murmurhash.cc util/random.cc util/rate_limiter.cc @@ -1065,12 +1099,21 @@ if(USE_FOLLY_LITE) third-party/folly/folly/synchronization/DistributedMutex.cpp third-party/folly/folly/synchronization/ParkingLot.cpp) include_directories(${PROJECT_SOURCE_DIR}/third-party/folly) + # Add boost to the include path exec_program(python3 ${PROJECT_SOURCE_DIR}/third-party/folly ARGS build/fbcode_builder/getdeps.py show-source-dir boost OUTPUT_VARIABLE BOOST_SOURCE_PATH) exec_program(ls ARGS -d ${BOOST_SOURCE_PATH}/boost* OUTPUT_VARIABLE BOOST_INCLUDE_DIR) include_directories(${BOOST_INCLUDE_DIR}) + # Add fmt to the include path + exec_program(python3 ${PROJECT_SOURCE_DIR}/third-party/folly ARGS + build/fbcode_builder/getdeps.py show-source-dir fmt OUTPUT_VARIABLE + FMT_SOURCE_PATH) + exec_program(ls ARGS -d ${FMT_SOURCE_PATH}/fmt*/include OUTPUT_VARIABLE + FMT_INCLUDE_DIR) + include_directories(${FMT_INCLUDE_DIR}) + add_definitions(-DUSE_FOLLY -DFOLLY_NO_CONFIG) list(APPEND THIRDPARTY_LIBS glog) endif() @@ -1339,9 +1382,11 @@ if(WITH_TESTS) db/db_bloom_filter_test.cc db/db_compaction_filter_test.cc db/db_compaction_test.cc + db/db_compaction_abort_test.cc db/db_clip_test.cc db/db_dynamic_level_test.cc db/db_encryption_test.cc + db/db_etc3_test.cc db/db_flush_test.cc db/db_inplace_update_test.cc db/db_io_failure_test.cc @@ -1456,6 +1501,7 @@ if(WITH_TESTS) util/autovector_test.cc util/bloom_test.cc util/coding_test.cc + util/compression_test.cc util/crc32c_test.cc util/defer_test.cc util/dynamic_bloom_test.cc @@ -1499,8 +1545,10 @@ if(WITH_TESTS) utilities/transactions/optimistic_transaction_test.cc utilities/transactions/transaction_test.cc utilities/transactions/lock/point/point_lock_manager_test.cc + utilities/transactions/lock/point/point_lock_manager_stress_test.cc utilities/transactions/write_committed_transaction_ts_test.cc utilities/transactions/write_prepared_transaction_test.cc + utilities/transactions/write_prepared_transaction_test_seqno.cc utilities/transactions/write_unprepared_transaction_test.cc utilities/transactions/lock/range/range_locking_test.cc utilities/transactions/timestamped_snapshot_test.cc @@ -1609,6 +1657,12 @@ if(WITH_BENCHMARK_TOOLS) utilities/persistent_cache/hash_table_bench.cc) target_link_libraries(hash_table_bench${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${GFLAGS_LIB} ${FOLLY_LIBS}) + + add_executable(point_lock_bench${ARTIFACT_SUFFIX} + utilities/transactions/lock/point/point_lock_bench.cc + utilities/transactions/lock/point/point_lock_bench_tool.cc) + target_link_libraries(point_lock_bench${ARTIFACT_SUFFIX} + ${ROCKSDB_LIB} ${GFLAGS_LIB} ${FOLLY_LIBS}) endif() option(WITH_TRACE_TOOLS "build with trace tools" ON) diff --git a/Directory.Build.props b/Directory.Build.props new file mode 100644 index 000000000000..5862fb2c2f45 --- /dev/null +++ b/Directory.Build.props @@ -0,0 +1,9 @@ + + + + ccache_msvc_compiler.bat + $(MSBuildThisFileDirectory) + true + true + + diff --git a/HISTORY.md b/HISTORY.md index ab8466abd1ce..277ade360676 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,219 @@ # Rocksdb Change Log > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt` +## 10.11.0 (01/23/2026) +### Public API Changes +* New SetOptions API that allows setting options for multiple CFs, avoiding the need to reserialize OPTIONS file for each CF +* Remove remaining pieces of Lua integration + +### Behavior Changes +* The new default for `BlockBasedTableOptions::format_version` is 7, which has been supported since RocksDB 10.4.0 and is required in order to use CompressionManagers supporting custom compression types. + +### Bug Fixes +* Fixed a small performance bug with `format_version=7` when decompressing formats other than Snappy and ZSTD. +* Fixed an infinite compaction loop bug with User-Defined Timestamps (UDT) where bottommost files were repeatedly marked for compaction even though their timestamp could not be collapsed. +* Bugfix for persisted UDT record sequence number zeroing logic. + +## 10.10.0 (12/16/2025) +### Bug Fixes +* Fixed a bug in best-efforts recovery that causes use-after-free crashes when accessing SST files that were cached during the recovery. +* Fix resumable compaction incorrectly allowing resumption from a truncated range deletion that is not well handled currently. +* Fixed a bug in `PosixRandomFileAccess` IO uring submission queue ownership & management. Fix eliminates the false positive 'Bad cqe data' IO errors in `PosixRandomFileAccess::MultiRead` when interleaved with `PosixRandomFileAccess::ReadAsync` on the same thread. + +## 10.9.0 (11/21/2025) +### New Features +* Added an auto-tuning feature for DB manifest file size that also (by default) improves the safety of existing configurations in case `max_manifest_file_size` is repeatedly exceeded. The new recommendation is to set `max_manifest_file_size` to something small like 1MB and tune `max_manifest_space_amp_pct` as needed to balance write amp and space amp in the manifest. Refer to comments on those options in `DBOptions` for details. Both options are (now) mutable. +* Added a new API to support option migration for multiple column families +* Added new option target_file_size_is_upper_bound that makes most compaction output SST files come close to the target file size without exceeding it, rather than commonly exceeding it by some fraction (current behavior). For now the new behavior is off by default, but we expect to enable it by default in the future. +* Add a new option allow_trivial_move in CompactionOptions to allow CompactFiles to perform trivial move if possible. By default the flag of allow_trivial_move is false, so it preserve the original behavior. + +### Public API Changes +* To reduce risk of ODR violations or similar, `ROCKSDB_USING_THREAD_STATUS` has been removed from public headers and replaced with static `const bool ThreadStatus::kEnabled`. Some other uses of conditional compilation have been removed from public API headers to reduce risk of ODR violations or other issues. + +### Behavior Changes +* PosixWritableFile now repositions the seek pointer to the new end of file after a call to Truncate. +* Updated standalone range deletion L0 file compaction behavior to avoid compacting with any newer L0 files (which is expensive and not useful). + +### Bug Fixes +* Fix a bug where compaction with range deletion can persist kTypeMaxValid in MANIFEST as file metadata. kTypeMaxValid is not supposed to be persisted and can change as new value types are introduced. This can cause a forward compatibility issue where older versions of RocksDB don't recognize kTypeMaxValid from newer versions. A new placeholder value type kTypeTruncatedRangeDeletionSentinel is also introduced to replace kTypeMaxValid when reading existing SST files' metadata from MANIFEST. This allows us to strengthen some checks to avoid using kTypeMaxValid in the future. +* Fixed a bug where `DB::GetSortedWalFiles()` could hang when waiting for a purge operation that found nothing to do (potentially triggered by iterator release, flush, compaction, etc.). +* Fixed a bug in MultiScan where `max_sequential_skip_in_iterations` could cause the iterator to seek backward to already-unpinned blocks when the same user key spans multiple data blocks, leading to assertion failures or seg fault. +* Fixed a bug for `WAL_ttl_seconds > 0` use cases where the newest archived WAL files could be incorrectly deleted when the system clock moved backwards. + +### Performance Improvements +* Added optimization that allowed for the asynchronous prefetching of all data outlined in a multiscan iterator. This optimization was applied to the level iterator, which prefetches all data through each of the block-based iterators. + +## 10.8.0 (10/21/2025) +### New Features +* Add kFSPrefetch to FSSupportedOps enum to allow file systems to indicate prefetch support capability, avoiding unnecessary prefetch system calls on file systems that don't support them. +* Added experimental support `OpenAndCompactOptions::allow_resumption` for resumable compaction that persists progress during `OpenAndCompact()`, allowing interrupted compactions to resume from the last progress persitence. The default behavior is to not persist progress. + +### Public API Changes +* Allow specifying output temperature in CompactionOptions +* Added `DB::FlushWAL(const FlushWALOptions&)` as an alternative to `DB::FlushWAL(bool sync)`, where `FlushWALOptions` includes a new `rate_limiter_priority` field (default `Env::IO_TOTAL`) that allows rate limiting and priority passing of manual WAL flush's IO operations. +* The MultiScan API contract is updated. After a multi scan range got prepared with Prepare API call, the following seeks must seek the start of each prepared scan range in order. In addition, when limit is set, upper bound must be set to the same value of limit before each seek + +### Behavior Changes +* `kChangeTemperature` FIFO compaction will now honor `compaction_target_temp` to all levels regardless of `cf_options::last_level_temperature` +* Allow UDIs with a non BytewiseComparator + +### Bug Fixes +* Fix incorrect MultiScan seek error status due to bugs in handling range limit falling between adjacent SST files key range. +* Fix a bug in Page unpinning in MultiScan + +### Performance Improvements +* Fixed a performance regression in LZ4 compression that started in version 10.6.0 + +## 10.7.0 (09/19/2025) +### New Features +* Add the fail_if_no_udi_on_open flag in BlockBasedTableOption to control whether a missing user defined index block in a SST is a hard error or not. +* A new flag memtable_verify_per_key_checksum_on_seek is added to AdvancedColumnFamilyOptions. When it is enabled, it will validate key checksum along the binary search path on skiplist based memtable during seek operation. +* Introduce option MultiScanArgs::use_async_io to enable asynchronous I/O during MultiScan, instead of waiting for I/O to be done in Prepare(). +* Add new option `MultiScanArgs::max_prefetch_size` that limits the memory usage of per file pinning of prefetched blocks. +* Improved `sst_dump` by allowing standalone file and directory arguments without `--file=`. Also added new options and better output for `sst_dump --command=recompress`. See `sst_dump --help` + +### Public API Changes +* HyperClockCache with no `estimated_entry_charge` is now production-ready and is the preferred block cache implementation vs. LRUCache. Please consider updating your code to minimize the risk of hitting performance bottlenecks or anomalies from LRUCache. See cache.h for more detail. +* RocksDB now requires a C++20 compatible compiler (GCC >= 11, Clang >= 10, Visual Studio >= 2019), including for any code using RocksDB headers. +* MultiScanArgs used to have a default constructor with default parameter of BytewiseComparator. Now it always requires Comparator in its constructor. + +### Behavior Changes +* The default provided block cache implementation is now HyperClockCache instead of LRUCache, when `block_cache` is nullptr (default) and `no_block_cache==false` (default). We recommend explicitly creating a HyperClockCache block cache based on memory budget and sharing it across all column families and even DB instances. This change could expose previously hidden memory or resource leaks. + +### Bug Fixes +* Reported numbers for compaction and flush CPU usage now include time spent by parallel compression worker threads. This now means compaction/flush CPU usage could exceed the wall clock time. +* Fix a race condition in FIFO size-based compaction where concurrent threads could select the same non-L0 file, causing assertion failures in debug builds or "Cannot delete table file from LSM tree" errors in release builds. +* Fix a bug in RocksDB MultiScan with UDI when one of the scan ranges is determined to be empty by the UDI, which causes incorrect results. + +### Performance Improvements +* Add a new table property "rocksdb.key.smallest.seqno" which records the smallest sequence number of all keys in file. It makes ingesting DB generated files faster by +avoiding scanning the whole file to find the smallest sequence number. +* Add a new experimental PerKeyPointLockManager to improve efficiency under high lock contention. PointLockManager was not efficient when there is high write contention on same key, as it uses a single conditional variable per lock stripe. PerKeyPointLockManager uses per thread conditional variable supporting fifo order. Although this is an experimental feature. By default, it is disabled. A new boolean flag TransactionDBOptions::use_per_key_point_lock_mgr is added to optionally enable it. Search the flag in code for more info. +Together, a new configuration TransactionOptions::deadlock_timeout_us is added, which allows the transaction to wait for a short period before perform deadlock detection. When the workload has low lock contention, the deadlock_timeout_us can be configured to be slightly higher than average transaction execution time, so that transaction would likely be able to take the lock before deadlock detection is performed when it is waiting for a lock. This allows transaction to reduce CPU cost on performing deadlock detection, which could be expensive in CPU time. When the workload has high lock contention, the deadlock_timeout_us can be configured to 0, so that transaction would perform deadlock detection immediately. By default the value is 0 to keep the behavior same as before. +* Majorly improved CPU efficiency and scalability of parallel compression (`CompressionOptions::parallel_threads` > 1), though this efficiency improvement makes parallel compression currently incompatible with UserDefinedIndex and with old setting of `decouple_partitioned_filters=false`. Parallel compression is now considered a production-ready feature. Maximum performance is available with `-DROCKSDB_USE_STD_SEMAPHORES` at compile time, but this is not currently recommended because of reported bugs in implementations of `std::counting_semaphore`/`binary_semaphore`. + +## 10.6.0 (08/22/2025) +### New Features +* Introduce column family option `cf_allow_ingest_behind`. This option aims to replace `DBOptions::allow_ingest_behind` to enable ingest behind at the per-CF level. `DBOptions::allow_ingest_behind` is deprecated. +* Introduce `MultiScanArgs::io_coalesce_threshold` to allow a configurable IO coalescing threshold. + +### Public API Changes +* `IngestExternalFileOptions::allow_db_generated_files` now allows files ingestion of any DB generated SST file, instead of only the ones with all keys having sequence number 0. +* `decouple_partitioned_filters = true` is now the default in BlockBasedTableOptions. +* GetTtl() API is now available in TTL DB +* Minimum supported version of LZ4 library is now 1.7.0 (r129 from 2015) +* Some changes to experimental Compressor and CompressionManager APIs +* A new Filesystem::SyncFile function is added for syncing a file that was already written, such as on file ingestion. The default implementation matches previous RocksDB behavior: re-open the file for read-write, sync it, and close it. We recommend overriding for FileSystems that do not require syncing for crash recovery or do not handle (well) re-opening for writes. + +### Behavior Changes +* When `allow_ingest_behind` is enabled, compaction will no longer drop tombstones based on the absence of underlying data. Tombstones will be preserved to apply to ingested files. + +### Bug Fixes +* Files in dropped column family won't be returned to the caller upon successful, offline MANIFEST iteration in `GetFileChecksumsFromCurrentManifest`. +* Fix a bug in MultiScan that causes it to fall back to a normal scan when dictionary compression is enabled. +* Fix a crash in iterator Prepare() when fill_cache=false +* Fix a bug in MultiScan where incorrect results can be returned when a Scan's range is across multiple files. +* Fixed a bug in remote compaction that may mistakenly delete live SST file(s) during the cleanup phase when no keys survive the compaction (all expired) +* Allow a user defined index to be configured from a string. +* Make the User Defined Index interface consistently use the user key format, fixing the previous mixed usage of internal and user key. + +### Performance Improvements +* Small improvement to CPU efficiency of compression using built-in algorithms, and a dramatic efficiency improvement for LZ4HC, based on reusing data structures between invocations. + +## 10.5.0 (07/18/2025) +### Public API Changes +* DB option skip_checking_sst_file_sizes_on_db_open is deprecated, in favor of validating file size in parallel in a thread pool, when db is opened. When DB is opened, with paranoid check enabled, a file with the wrong size would fail the DB open. With paranoid check disabled, the DB open would succeed, the column family with the corrupted file would not be read or write, while the other healthy column families could be read and write normally. When max_open_files option is not set to -1, only a subset of the files will be opened and checked. The rest of the files will be opened and checked when they are accessed. + +### Behavior Changes +* PessimisticTransaction::GetWaitingTxns now returns waiting transaction information even if the current transaction has timed out. This allows the information to be surfaced to users for debugging purposes once it is known that the timeout has occurred. +* A new API GetFileSize is added to FSRandomAccessFile interface class. It uses fstat vs stat on the posix implementation which is more efficient. Caller could use it to get file size faster. This function might be required in the future for FileSystem implementation outside of the RocksDB code base. +* RocksDB now triggers eligible compactions every 12 hours when periodic compaction is configured. This solves a limitation of the compaction trigger mechanism, which would only trigger compaction after specific events like flush, compaction, or SetOptions. + +### Bug Fixes +* Fix a bug in BackupEngine that can crash backup due to a null FSWritableFile passed to WritableFileWriter. +* Fix DB::NewMultiScan iterator to respect the scan upper bound specified in ScanOptions + +### Performance Improvements +* Optimized MultiScan using BlockBasedTable to coalesce I/Os and prefetch all data blocks. + +## 10.4.0 (06/20/2025) +### New Features +* Add a new CF option `memtable_avg_op_scan_flush_trigger` that supports triggering memtable flush when an iterator scans through an expensive range of keys, with the average number of skipped keys from the active memtable exceeding the threshold. +* Vector based memtable now supports concurrent writers (DBOptions::allow_concurrent_memtable_write) #13675. +* Add new experimental `TransactionOptions::large_txn_commit_optimize_byte_threshold` to enable optimizations for large transaction commit by transaction batch data size. +* Add a new option `CompactionOptionsUniversal::reduce_file_locking` and if it's true, auto universal compaction picking will adjust to minimize locking of input files when bottom priority compactions are waiting to run. This can increase the likelihood of existing L0s being selected for compaction, thereby improving write stall and reducing read regression. +* Add new `format_version=7` to aid experimental support of custom compression algorithms with CompressionManager and block-based table. This format version includes changing the format of `TableProperties::compression_name`. + +### Public API Changes +* Change NewExternalTableFactory to return a unique_ptr instead of shared_ptr. +* Add an optional min file size requirement for deletion triggered compaction. It can be specified when creating `CompactOnDeletionCollectorFactory`. + +### Behavior Changes +* `TransactionOptions::large_txn_commit_optimize_threshold` now has default value 0 for disabled. `TransactionDBOptions::txn_commit_bypass_memtable_threshold` now has no effect on transactions. + +### Bug Fixes +* Fix a bug where CreateColumnFamilyWithImport() could miss the SST file for the memtable flush it triggered. The exported CF then may not contain the updates in the memtable when CreateColumnFamilyWithImport() is called. +* Fix iterator operations returning NotImplemented status if disallow_memtable_writes and paranoid_memory_checks CF options are both set. +* Fixed handling of file checksums in IngestExternalFile() to allow providing checksums using recognized but not necessarily the DB's preferred checksum function, to ease migration between checksum functions. + +## 10.3.0 (05/17/2025) +### New Features +* Add new experimental `CompactionOptionsFIFO::allow_trivial_copy_when_change_temperature` along with `CompactionOptionsFIFO::trivial_copy_buffer_size` to allow optimizing FIFO compactions with tiering when kChangeTemperature to move files from source tier FileSystem to another tier FileSystem via trivial and direct copying raw sst file instead of reading thru the content of the SST file then rebuilding the table files. +* Add a new field to Compaction Stats in LOG files for the pre-compression size written to each level. +* Add new experimental `TransactionOptions::large_txn_commit_optimize_threshold` to enable optimizations for large transaction commit with per transaction threshold. `TransactionDBOptions::txn_commit_bypass_memtable_threshold` is deprecated in favor of this transaction option. +* [internal team use only] Allow an application-defined `request_id` to be passed to RocksDB and propagated to the filesystem via IODebugContext + +### Bug Fixes +* Fix a bug where transaction lock upgrade can incorrectly fail with a Deadlock status. This happens when a transaction has a non-zero timeout and tries to upgrade a shared lock that is also held by another transaction. +* Pass wrapped WritableFileWriter pointer to ExternalTableBuilder so that the file checksum can be correctly calculated and returned by SstFileWriter for external table files. +* Fix an infinite-loop bug in transaction locking. This can happen if a transaction reaches lock limit and its time out expires before it attempts to wait for it. +* Fixed a potential data race with `CompressionOptions::parallel_threads > 1` and a `TablePropertiesCollector` overriding `BlockAdd()`. + +## 10.2.0 (04/21/2025) +### New Features +* Provide histogram stats `COMPACTION_PREFETCH_BYTES` to measure number of bytes for RocksDB's prefetching (as opposed to file +system's prefetch) on SST file during compaction read +* A new API DB::GetNewestUserDefinedTimestamp is added to return the newest user defined timestamp seen in a column family +* Introduce API `IngestWriteBatchWithIndex()` for ingesting updates into DB while bypassing memtable writes. This improves performance when writing a large write batch to the DB. +* Add a new CF option `memtable_op_scan_flush_trigger` that triggers a flush of the memtable if an iterator's Seek()/Next() scans over a certain number of invisible entries from the memtable. + +### Public API Changes +* AdvancedColumnFamilyOptions.max_write_buffer_number_to_maintain is deleted. It's deprecated since introduction of a better option max_write_buffer_size_to_maintain since RocksDB 6.5.0. +* Deprecated API `DB::MaxMemCompactionLevel()`. +* Deprecated `ReadOptions::ignore_range_deletions`. +* Deprecated API `experimental::PromoteL0()`. +* Added arbitrary string map for additional options to be overridden for remote compactions +* The fail_if_options_file_error option in DBOptions has been removed. The behavior now is to always return failure in any API that fails to persist the OPTIONS file. + +### Behavior Changes +* Make stats `PREFETCH_BYTES_USEFUL`, `PREFETCH_HITS`, `PREFETCH_BYTES` only account for prefetching during user initiated scan + +### Bug Fixes +* Fix a bug in Posix file system that the FSWritableFile created via `FileSystem::ReopenWritableFile` internally does not track the correct file size. +* Fix a bug where tail size of remote compaction output is not persisted in primary db's manifest + +## 10.1.0 (03/24/2025) +### New Features +* Added a new `DBOptions.calculate_sst_write_lifetime_hint_set` setting that allows to customize which compaction styles SST write lifetime hint calculation is allowed on. Today RocksDB supports only two modes `kCompactionStyleLevel` and `kCompactionStyleUniversal`. +* Add a new field `num_l0_files` in `CompactionJobInfo` about the number of L0 files in the CF right before and after the compaction +* Added per-key-placement feature in Remote Compaction +* Implemented API DB::GetPropertiesOfTablesByLevel that retrieves table properties for files in each LSM tree level + +### Public API Changes +* `GetAllKeyVersions()` now interprets empty slices literally, as valid keys, and uses new `OptSlice` type default value for extreme upper and lower range limits. +* `DeleteFilesInRanges()` now takes `RangeOpt` which is based on `OptSlice`. The overload taking `RangePtr` is deprecated. +* Add an unordered map of name/value pairs, ReadOptions::property_bag, to pass opaque options through to an external table when creating an Iterator. +* Introduced CompactionServiceJobStatus::kAborted to allow handling aborted scenario in Schedule(), Wait() or OnInstallation() APIs in Remote Compactions. +* format\_version < 2 in BlockBasedTableOptions is no longer supported for writing new files. Support for reading such files is deprecated and might be removed in the future. `CompressedSecondaryCacheOptions::compress_format_version == 1` is also deprecated. + +### Behavior Changes +* `ldb` now returns an error if the specified `--compression_type` is not supported in the build. +* MultiGet with snapshot and ReadOptions::read_tier = kPersistedTier will now read a consistent view across CFs (instead of potentially reading some CF before and some CF after a flush). +* CreateColumnFamily() is no longer allowed on a read-only DB (OpenForReadOnly()) + +### Bug Fixes +* Fixed stats for Tiered Storage with preclude_last_level feature + ## 10.0.0 (02/21/2025) ### New Features * Introduced new `auto_refresh_iterator_with_snapshot` opt-in knob that (when enabled) will periodically release obsolete memory and storage resources for as long as the iterator is making progress and its supplied `read_options.snapshot` was initialized with non-nullptr value. @@ -119,7 +332,7 @@ * In FIFO compaction, compactions for changing file temperature (configured by option `file_temperature_age_thresholds`) will compact one file at a time, instead of merging multiple eligible file together (#13018). * Support ingesting db generated files using hard link, i.e. IngestExternalFileOptions::move_files/link_files and IngestExternalFileOptions::allow_db_generated_files. * Add a new file ingestion option `IngestExternalFileOptions::link_files` to hard link input files and preserve original files links after ingestion. -* DB::Close now untracks files in SstFileManager, making avaialble any space used +* DB::Close now untracks files in SstFileManager, making available any space used by them. Prior to this change they would be orphaned until the DB is re-opened. ### Bug Fixes @@ -315,7 +528,7 @@ MultiGetBenchmarks.multiGetList10 no_column_family 10000 16 100 1024 thrpt 25 76 * Removed deprecated option `ColumnFamilyOptions::check_flush_compaction_key_order` * Remove the default `WritableFile::GetFileSize` and `FSWritableFile::GetFileSize` implementation that returns 0 and make it pure virtual, so that subclasses are enforced to explicitly provide an implementation. * Removed deprecated option `ColumnFamilyOptions::level_compaction_dynamic_file_size` -* Removed tickers with typos "rocksdb.error.handler.bg.errro.count", "rocksdb.error.handler.bg.io.errro.count", "rocksdb.error.handler.bg.retryable.io.errro.count". +* Removed tickers with typos "rocksdb.error.handler.bg.error.count", "rocksdb.error.handler.bg.io.error.count", "rocksdb.error.handler.bg.retryable.io.error.count". * Remove the force mode for `EnableFileDeletions` API because it is unsafe with no known legitimate use. * Removed deprecated option `ColumnFamilyOptions::ignore_max_compaction_bytes_for_input` * `sst_dump --command=check` now compares the number of records in a table with `num_entries` in table property, and reports corruption if there is a mismatch. API `SstFileDumper::ReadSequential()` is updated to optionally do this verification. (#12322) @@ -342,7 +555,7 @@ MultiGetBenchmarks.multiGetList10 no_column_family 10000 16 100 1024 thrpt 25 76 * Exposed options ttl via c api. ### Behavior Changes -* `rocksdb.blobdb.blob.file.write.micros` expands to also measure time writing the header and footer. Therefore the COUNT may be higher and values may be smaller than before. For stacked BlobDB, it no longer measures the time of explictly flushing blob file. +* `rocksdb.blobdb.blob.file.write.micros` expands to also measure time writing the header and footer. Therefore the COUNT may be higher and values may be smaller than before. For stacked BlobDB, it no longer measures the time of explicitly flushing blob file. * Files will be compacted to the next level if the data age exceeds periodic_compaction_seconds except for the last level. * Reduced the compaction debt ratio trigger for scheduling parallel compactions * For leveled compaction with default compaction pri (kMinOverlappingRatio), files marked for compaction will be prioritized over files not marked when picking a file from a level for compaction. @@ -407,7 +620,7 @@ want to continue to use force enabling, they need to explicitly pass a `true` to ### Behavior Changes * During off-peak hours defined by `daily_offpeak_time_utc`, the compaction picker will select a larger number of files for periodic compaction. This selection will include files that are projected to expire by the next off-peak start time, ensuring that these files are not chosen for periodic compaction outside of off-peak hours. -* If an error occurs when writing to a trace file after `DB::StartTrace()`, the subsequent trace writes are skipped to avoid writing to a file that has previously seen error. In this case, `DB::EndTrace()` will also return a non-ok status with info about the error occured previously in its status message. +* If an error occurs when writing to a trace file after `DB::StartTrace()`, the subsequent trace writes are skipped to avoid writing to a file that has previously seen error. In this case, `DB::EndTrace()` will also return a non-ok status with info about the error occurred previously in its status message. * Deleting stale files upon recovery are delegated to SstFileManger if available so they can be rate limited. * Make RocksDB only call `TablePropertiesCollector::Finish()` once. * When `WAL_ttl_seconds > 0`, we now process archived WALs for deletion at least every `WAL_ttl_seconds / 2` seconds. Previously it could be less frequent in case of small `WAL_ttl_seconds` values when size-based expiration (`WAL_size_limit_MB > 0 `) was simultaneously enabled. @@ -1195,7 +1408,7 @@ Note: The next release will be major release 7.0. See https://github.com/faceboo ### Public API change * Extend WriteBatch::AssignTimestamp and AssignTimestamps API so that both functions can accept an optional `checker` argument that performs additional checking on timestamp sizes. * Introduce a new EventListener callback that will be called upon the end of automatic error recovery. -* Add IncreaseFullHistoryTsLow API so users can advance each column family's full_history_ts_low seperately. +* Add IncreaseFullHistoryTsLow API so users can advance each column family's full_history_ts_low separately. * Add GetFullHistoryTsLow API so users can query current full_history_low value of specified column family. ### Performance Improvements diff --git a/INSTALL.md b/INSTALL.md index 5bc5bd7b297e..1e739d485d02 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -6,7 +6,7 @@ than release mode. RocksDB's library should be able to compile without any dependency installed, although we recommend installing some compression libraries (see below). -We do depend on newer gcc/clang with C++17 support (GCC >= 7, Clang >= 5). +We do depend on newer gcc/clang with C++20 support (GCC >= 11, Clang >= 10). There are few options when compiling RocksDB: @@ -60,7 +60,7 @@ most processors made since roughly 2013. ## Supported platforms * **Linux - Ubuntu** - * Upgrade your gcc to version at least 7 to get C++17 support. + * Upgrade your gcc to version at least 11 to get C++20 support. * Install gflags. First, try: `sudo apt-get install libgflags-dev` If this doesn't work and you're using Ubuntu, here's a nice tutorial: (http://askubuntu.com/questions/312173/installing-gflags-12-04) @@ -72,7 +72,7 @@ most processors made since roughly 2013. * Install zstandard: `sudo apt-get install libzstd-dev`. * **Linux - CentOS / RHEL** - * Upgrade your gcc to version at least 7 to get C++17 support + * Upgrade your gcc to version at least 11 to get C++20 support * Install gflags: git clone https://github.com/gflags/gflags.git @@ -122,7 +122,7 @@ most processors made since roughly 2013. make && sudo make install * **OS X**: - * Install latest C++ compiler that supports C++ 17: + * Install latest C++ compiler that supports C++20: * Update XCode: run `xcode-select --install` (or install it from XCode App's settting). * Install via [homebrew](http://brew.sh/). * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line. @@ -213,7 +213,7 @@ most processors made since roughly 2013. export PATH=/opt/freeware/bin:$PATH * **Solaris Sparc** - * Install GCC 7 and higher. + * Install GCC 11 and higher. * Use these environment variables: export CC=gcc diff --git a/Makefile b/Makefile index 4b1d0414ae3c..40d7437c2f6e 100644 --- a/Makefile +++ b/Makefile @@ -148,10 +148,8 @@ ifeq ($(USE_COROUTINES), 1) USE_FOLLY = 1 # glog/logging.h requires HAVE_CXX11_ATOMIC OPT += -DUSE_COROUTINES -DHAVE_CXX11_ATOMIC - ROCKSDB_CXX_STANDARD = c++2a USE_RTTI = 1 ifneq ($(USE_CLANG), 1) - ROCKSDB_CXX_STANDARD = c++20 PLATFORM_CXXFLAGS += -fcoroutines endif endif @@ -298,6 +296,28 @@ $(info $(shell $(CC) --version)) $(info $(shell $(CXX) --version)) endif +# ccache support +# Set USE_CCACHE=1 to enable ccache, or let it auto-detect +ifndef USE_CCACHE + CCACHE := $(shell which ccache 2>/dev/null) + ifneq ($(CCACHE),) + USE_CCACHE := 1 + else + USE_CCACHE := 0 + endif +endif + +ifeq ($(USE_CCACHE), 1) + CCACHE := $(shell which ccache 2>/dev/null) + ifneq ($(CCACHE),) + $(info Using ccache: $(CCACHE)) + CC := $(CCACHE) $(CC) + CXX := $(CCACHE) $(CXX) + else + $(warning ccache requested but not found in PATH) + endif +endif + missing_make_config_paths := $(shell \ grep "\./\S*\|/\S*" -o $(CURDIR)/make_config.mk | \ while read path; \ @@ -370,8 +390,6 @@ ifdef COMPILE_WITH_TSAN # Turn off -pg when enabling TSAN testing, because that induces # a link failure. TODO: find the root cause PROFILING_FLAGS = - # LUA is not supported under TSAN - LUA_PATH = # Limit keys for crash test under TSAN to avoid error: # "ThreadSanitizer: DenseSlabAllocator overflow. Dying." CRASH_TEST_EXT_ARGS += --max_key=1000000 @@ -448,83 +466,7 @@ else PLATFORM_CXXFLAGS += -isystem $(GTEST_DIR) endif -# This provides a Makefile simulation of a Meta-internal folly integration. -# It is not validated for general use. -# -# USE_FOLLY links the build targets with libfolly.a. The latter could be -# built using 'make build_folly', or built externally and specified in -# the CXXFLAGS and EXTRA_LDFLAGS env variables. The build_detect_platform -# script tries to detect if an external folly dependency has been specified. -# If not, it exports FOLLY_PATH to the path of the installed Folly and -# dependency libraries. -# -# USE_FOLLY_LITE cherry picks source files from Folly to include in the -# RocksDB library. Its faster and has fewer dependencies on 3rd party -# libraries, but with limited functionality. For example, coroutine -# functionality is not available. -ifeq ($(USE_FOLLY),1) -ifeq ($(USE_FOLLY_LITE),1) -$(error Please specify only one of USE_FOLLY and USE_FOLLY_LITE) -endif -ifneq ($(strip $(FOLLY_PATH)),) - BOOST_PATH = $(shell (ls -d $(FOLLY_PATH)/../boost*)) - DBL_CONV_PATH = $(shell (ls -d $(FOLLY_PATH)/../double-conversion*)) - GFLAGS_PATH = $(shell (ls -d $(FOLLY_PATH)/../gflags*)) - GLOG_PATH = $(shell (ls -d $(FOLLY_PATH)/../glog*)) - LIBEVENT_PATH = $(shell (ls -d $(FOLLY_PATH)/../libevent*)) - XZ_PATH = $(shell (ls -d $(FOLLY_PATH)/../xz*)) - LIBSODIUM_PATH = $(shell (ls -d $(FOLLY_PATH)/../libsodium*)) - FMT_PATH = $(shell (ls -d $(FOLLY_PATH)/../fmt*)) - - # For some reason, glog and fmt libraries are under either lib or lib64 - GLOG_LIB_PATH = $(shell (ls -d $(GLOG_PATH)/lib*)) - FMT_LIB_PATH = $(shell (ls -d $(FMT_PATH)/lib*)) - - # AIX: pre-defined system headers are surrounded by an extern "C" block - ifeq ($(PLATFORM), OS_AIX) - PLATFORM_CCFLAGS += -I$(BOOST_PATH)/include -I$(DBL_CONV_PATH)/include -I$(GLOG_PATH)/include -I$(LIBEVENT_PATH)/include -I$(XZ_PATH)/include -I$(LIBSODIUM_PATH)/include -I$(FOLLY_PATH)/include -I$(FMT_PATH)/include - PLATFORM_CXXFLAGS += -I$(BOOST_PATH)/include -I$(DBL_CONV_PATH)/include -I$(GLOG_PATH)/include -I$(LIBEVENT_PATH)/include -I$(XZ_PATH)/include -I$(LIBSODIUM_PATH)/include -I$(FOLLY_PATH)/include -I$(FMT_PATH)/include - else - PLATFORM_CCFLAGS += -isystem $(BOOST_PATH)/include -isystem $(DBL_CONV_PATH)/include -isystem $(GLOG_PATH)/include -isystem $(LIBEVENT_PATH)/include -isystem $(XZ_PATH)/include -isystem $(LIBSODIUM_PATH)/include -isystem $(FOLLY_PATH)/include -isystem $(FMT_PATH)/include - PLATFORM_CXXFLAGS += -isystem $(BOOST_PATH)/include -isystem $(DBL_CONV_PATH)/include -isystem $(GLOG_PATH)/include -isystem $(LIBEVENT_PATH)/include -isystem $(XZ_PATH)/include -isystem $(LIBSODIUM_PATH)/include -isystem $(FOLLY_PATH)/include -isystem $(FMT_PATH)/include - endif - - # Add -ldl at the end as gcc resolves a symbol in a library by searching only in libraries specified later - # in the command line - PLATFORM_LDFLAGS += $(FOLLY_PATH)/lib/libfolly.a $(BOOST_PATH)/lib/libboost_context.a $(BOOST_PATH)/lib/libboost_filesystem.a $(BOOST_PATH)/lib/libboost_atomic.a $(BOOST_PATH)/lib/libboost_program_options.a $(BOOST_PATH)/lib/libboost_regex.a $(BOOST_PATH)/lib/libboost_system.a $(BOOST_PATH)/lib/libboost_thread.a $(DBL_CONV_PATH)/lib/libdouble-conversion.a $(FMT_LIB_PATH)/libfmt.a $(GLOG_LIB_PATH)/libglog.so $(GFLAGS_PATH)/lib/libgflags.so.2.2 $(LIBEVENT_PATH)/lib/libevent-2.1.so -ldl - PLATFORM_LDFLAGS += -Wl,-rpath=$(GFLAGS_PATH)/lib -Wl,-rpath=$(GLOG_LIB_PATH) -Wl,-rpath=$(LIBEVENT_PATH)/lib -Wl,-rpath=$(LIBSODIUM_PATH)/lib -Wl,-rpath=$(LIBEVENT_PATH)/lib -endif - PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG - PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG -endif - -ifeq ($(USE_FOLLY_LITE),1) - # Path to the Folly source code and include files - FOLLY_DIR = ./third-party/folly -ifneq ($(strip $(BOOST_SOURCE_PATH)),) - BOOST_INCLUDE = $(shell (ls -d $(BOOST_SOURCE_PATH)/boost*/)) - # AIX: pre-defined system headers are surrounded by an extern "C" block - ifeq ($(PLATFORM), OS_AIX) - PLATFORM_CCFLAGS += -I$(BOOST_INCLUDE) - PLATFORM_CXXFLAGS += -I$(BOOST_INCLUDE) - else - PLATFORM_CCFLAGS += -isystem $(BOOST_INCLUDE) - PLATFORM_CXXFLAGS += -isystem $(BOOST_INCLUDE) - endif -endif # BOOST_SOURCE_PATH - # AIX: pre-defined system headers are surrounded by an extern "C" block - ifeq ($(PLATFORM), OS_AIX) - PLATFORM_CCFLAGS += -I$(FOLLY_DIR) - PLATFORM_CXXFLAGS += -I$(FOLLY_DIR) - else - PLATFORM_CCFLAGS += -isystem $(FOLLY_DIR) - PLATFORM_CXXFLAGS += -isystem $(FOLLY_DIR) - endif - PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG - PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG -# TODO: fix linking with fbcode compiler config - PLATFORM_LDFLAGS += -lglog -endif +include folly.mk ifdef TEST_CACHE_LINE_SIZE PLATFORM_CCFLAGS += -DTEST_CACHE_LINE_SIZE=$(TEST_CACHE_LINE_SIZE) @@ -564,32 +506,6 @@ ifndef DISABLE_WARNING_AS_ERROR endif -ifdef LUA_PATH - -ifndef LUA_INCLUDE -LUA_INCLUDE=$(LUA_PATH)/include -endif - -LUA_INCLUDE_FILE=$(LUA_INCLUDE)/lualib.h - -ifeq ("$(wildcard $(LUA_INCLUDE_FILE))", "") -# LUA_INCLUDE_FILE does not exist -$(error Cannot find lualib.h under $(LUA_INCLUDE). Try to specify both LUA_PATH and LUA_INCLUDE manually) -endif -LUA_FLAGS = -I$(LUA_INCLUDE) -DLUA -DLUA_COMPAT_ALL -CFLAGS += $(LUA_FLAGS) -CXXFLAGS += $(LUA_FLAGS) - -ifndef LUA_LIB -LUA_LIB = $(LUA_PATH)/lib/liblua.a -endif -ifeq ("$(wildcard $(LUA_LIB))", "") # LUA_LIB does not exist -$(error $(LUA_LIB) does not exist. Try to specify both LUA_PATH and LUA_LIB manually) -endif -EXEC_LDFLAGS += $(LUA_LIB) - -endif - ifeq ($(NO_THREEWAY_CRC32C), 1) CXXFLAGS += -DNO_THREEWAY_CRC32C endif @@ -638,13 +554,14 @@ endif TEST_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES)) $(GTEST) BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(BENCH_LIB_SOURCES)) CACHE_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(CACHE_BENCH_LIB_SOURCES)) +POINT_LOCK_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(POINT_LOCK_BENCH_LIB_SOURCES)) TOOL_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TOOL_LIB_SOURCES)) ANALYZE_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ANALYZER_LIB_SOURCES)) STRESS_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(STRESS_LIB_SOURCES)) # Exclude build_version.cc -- a generated source file -- from all sources. Not needed for dependencies ALL_SOURCES = $(filter-out util/build_version.cc, $(LIB_SOURCES)) $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES) $(GTEST_DIR)/gtest/gtest-all.cc -ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(CACHE_BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES) +ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(CACHE_BENCH_LIB_SOURCES) $(POINT_LOCK_BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES) ALL_SOURCES += $(TEST_MAIN_SOURCES) $(TOOL_MAIN_SOURCES) $(BENCH_MAIN_SOURCES) ALL_SOURCES += $(ROCKSDB_PLUGIN_SOURCES) $(ROCKSDB_PLUGIN_TESTS) @@ -659,8 +576,8 @@ ifneq ($(filter check-headers, $(MAKECMDGOALS)),) # TODO: add/support JNI headers DEV_HEADER_DIRS := $(sort include/ $(dir $(ALL_SOURCES))) # Some headers like in port/ are platform-specific - DEV_HEADERS_TO_CHECK := $(shell $(FIND) $(DEV_HEADER_DIRS) -type f -name '*.h' | grep -E -v 'port/|plugin/|lua/|range_tree/|secondary_index/') - PUBLIC_HEADERS_TO_CHECK := $(shell $(FIND) include/ -type f -name '*.h' | grep -E -v 'lua/') + DEV_HEADERS_TO_CHECK := $(shell $(FIND) $(DEV_HEADER_DIRS) -type f -name '*.h' | grep -E -v 'port/|plugin/|range_tree/|secondary_index/') + PUBLIC_HEADERS_TO_CHECK := $(shell $(FIND) include/ -type f -name '*.h') else DEV_HEADERS_TO_CHECK := PUBLIC_HEADERS_TO_CHECK := @@ -683,7 +600,8 @@ am__v_CCH_1 = # user build settings %.h.pub: %.h # .h.pub not actually created, so re-checked on each invocation $(AM_V_CCH) cd include/ && echo '#include "$(patsubst include/%,%,$<)"' | \ - $(CXX) -I. -DROCKSDB_NAMESPACE=42 -x c++ -c - -o /dev/null + $(CXX) -std=$(or $(ROCKSDB_CXX_STANDARD),c++20) -I. -DROCKSDB_NAMESPACE=42 -x c++ -c - -o /dev/null + build_tools/check-public-header.sh $< check-headers: $(HEADER_OK_FILES) @@ -887,7 +805,7 @@ endif # PLATFORM_SHARED_EXT .PHONY: check clean coverage ldb_tests package dbg gen-pc build_size \ release tags tags0 valgrind_check format static_lib shared_lib all \ rocksdbjavastatic rocksdbjava install install-static install-shared \ - uninstall analyze tools tools_lib check-headers checkout_folly + uninstall analyze tools tools_lib check-headers checkout_folly clang-tidy all: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(TESTS) @@ -1075,6 +993,11 @@ watch-log: dump-log: bash -c '$(quoted_perl_command)' < LOG +# Machine-parseable progress output for automated monitoring (e.g., Claude Code) +# Outputs JSON: {"status":"running","completed":45,"total":100,"failed":0,"percent":45,"eta_seconds":120} +check-progress: + @build_tools/check_progress.sh + # If J != 1 and GNU parallel is installed, run the tests in parallel, # via the check_0 rule above. Otherwise, run them sequentially. check: all @@ -1286,6 +1209,10 @@ tags0: format: build_tools/format-diff.sh +# Non-interactive format (auto-apply without prompts, for CI/automation/Claude Code) +format-auto: + build_tools/format-diff.sh -y + check-format: build_tools/format-diff.sh -c @@ -1295,6 +1222,15 @@ check-buck-targets: check-sources: build_tools/check-sources.sh +# Run clang-tidy on locally changed files, filtered to changed lines only. +# Requires compile_commands.json (generate with cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON). +# Override CLANG_TIDY_BINARY and CLANG_TIDY_JOBS as needed: +# make clang-tidy CLANG_TIDY_BINARY=/usr/bin/clang-tidy CLANG_TIDY_JOBS=8 +CLANG_TIDY_BINARY ?= /opt/homebrew/opt/llvm/bin/clang-tidy +CLANG_TIDY_JOBS ?= $(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4) +clang-tidy: + python3 tools/run_clang_tidy.py --clang-tidy-binary $(CLANG_TIDY_BINARY) -j $(CLANG_TIDY_JOBS) + package: bash build_tools/make_package.sh $(SHARED_MAJOR).$(SHARED_MINOR) @@ -1345,6 +1281,9 @@ block_cache_trace_analyzer: $(OBJ_DIR)/tools/block_cache_analyzer/block_cache_tr cache_bench: $(OBJ_DIR)/cache/cache_bench.o $(CACHE_BENCH_OBJECTS) $(LIBRARY) $(AM_LINK) +point_lock_bench: $(OBJ_DIR)/utilities/transactions/lock/point/point_lock_bench.o $(POINT_LOCK_BENCH_OBJECTS) $(LIBRARY) + $(AM_LINK) + persistent_cache_bench: $(OBJ_DIR)/utilities/persistent_cache/persistent_cache_bench.o $(LIBRARY) $(AM_LINK) @@ -1357,6 +1296,9 @@ filter_bench: $(OBJ_DIR)/util/filter_bench.o $(LIBRARY) db_stress: $(OBJ_DIR)/db_stress_tool/db_stress.o $(STRESS_LIBRARY) $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) +db_stress_compression_manager: $(OBJ_DIR)/db_stress_tool/db_stress_compression_manager.o $(LIBRARY) + $(AM_LINK) + write_stress: $(OBJ_DIR)/tools/write_stress.o $(LIBRARY) $(AM_LINK) @@ -1422,13 +1364,13 @@ agg_merge_test: $(OBJ_DIR)/utilities/agg_merge/agg_merge_test.o $(TEST_LIBRARY) stringappend_test: $(OBJ_DIR)/utilities/merge_operators/string_append/stringappend_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cassandra_format_test: $(OBJ_DIR)/utilities/cassandra/cassandra_format_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY) +cassandra_format_test: $(OBJ_DIR)/utilities/cassandra/cassandra_format_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cassandra_functional_test: $(OBJ_DIR)/utilities/cassandra/cassandra_functional_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY) +cassandra_functional_test: $(OBJ_DIR)/utilities/cassandra/cassandra_functional_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cassandra_row_merge_test: $(OBJ_DIR)/utilities/cassandra/cassandra_row_merge_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY) +cassandra_row_merge_test: $(OBJ_DIR)/utilities/cassandra/cassandra_row_merge_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) cassandra_serialize_test: $(OBJ_DIR)/utilities/cassandra/cassandra_serialize_test.o $(TEST_LIBRARY) $(LIBRARY) @@ -1491,6 +1433,12 @@ db_test: $(OBJ_DIR)/db/db_test.o $(TEST_LIBRARY) $(LIBRARY) db_test2: $(OBJ_DIR)/db/db_test2.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +db_etc3_test: $(OBJ_DIR)/db/db_etc3_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +compression_test: $(OBJ_DIR)/util/compression_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + db_logical_block_size_cache_test: $(OBJ_DIR)/db/db_logical_block_size_cache_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1512,6 +1460,9 @@ db_compaction_filter_test: $(OBJ_DIR)/db/db_compaction_filter_test.o $(TEST_LIBR db_compaction_test: $(OBJ_DIR)/db/db_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +db_compaction_abort_test: $(OBJ_DIR)/db/db_compaction_abort_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + db_clip_test: $(OBJ_DIR)/db/db_clip_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1875,6 +1826,9 @@ heap_test: $(OBJ_DIR)/util/heap_test.o $(TEST_LIBRARY) $(LIBRARY) point_lock_manager_test: utilities/transactions/lock/point/point_lock_manager_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +point_lock_manager_stress_test: utilities/transactions/lock/point/point_lock_manager_stress_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + transaction_test: $(OBJ_DIR)/utilities/transactions/transaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1884,6 +1838,9 @@ write_committed_transaction_ts_test: $(OBJ_DIR)/utilities/transactions/write_com write_prepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_prepared_transaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +write_prepared_transaction_test_seqno: $(OBJ_DIR)/utilities/transactions/write_prepared_transaction_test_seqno.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + write_unprepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_unprepared_transaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1989,6 +1946,9 @@ blob_source_test: $(OBJ_DIR)/db/blob/blob_source_test.o $(TEST_LIBRARY) $(LIBRAR blob_garbage_meter_test: $(OBJ_DIR)/db/blob/blob_garbage_meter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +io_dispatcher_test: $(OBJ_DIR)/util/io_dispatcher_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + timer_test: $(OBJ_DIR)/util/timer_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -2034,6 +1994,9 @@ wide_column_serialization_test: $(OBJ_DIR)/db/wide/wide_column_serialization_tes wide_columns_helper_test: $(OBJ_DIR)/db/wide/wide_columns_helper_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +interval_test: $(OBJ_DIR)/util/interval_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + #------------------------------------------------- # make install related stuff PREFIX ?= /usr/local @@ -2144,14 +2107,14 @@ ZLIB_DOWNLOAD_BASE ?= http://zlib.net BZIP2_VER ?= 1.0.8 BZIP2_SHA256 ?= ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269 BZIP2_DOWNLOAD_BASE ?= http://sourceware.org/pub/bzip2 -SNAPPY_VER ?= 1.2.1 -SNAPPY_SHA256 ?= 736aeb64d86566d2236ddffa2865ee5d7a82d26c9016b36218fcc27ea4f09f86 +SNAPPY_VER ?= 1.2.2 +SNAPPY_SHA256 ?= 90f74bc1fbf78a6c56b3c4a082a05103b3a56bb17bca1a27e052ea11723292dc SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive -LZ4_VER ?= 1.9.4 -LZ4_SHA256 ?= 0b0e3aa07c8c063ddf40b082bdf7e37a1562bda40a0ff5272957f3e987e0e54b +LZ4_VER ?= 1.10.0 +LZ4_SHA256 ?= 537512904744b35e232912055ccf8ec66d768639ff3abe5788d90d792ec5f48b LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive -ZSTD_VER ?= 1.5.5 -ZSTD_SHA256 ?= 98e9c3d949d1b924e28e01eccb7deed865eefebf25c2f21c702e5cd5b63b85e1 +ZSTD_VER ?= 1.5.7 +ZSTD_SHA256 ?= 37d7284556b20954e56e1ca85b80226768902e2edabd3b649e9e72c0c9012ee3 ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive CURL_SSL_OPTS ?= --tlsv1 @@ -2242,7 +2205,7 @@ libsnappy.a: snappy-$(SNAPPY_VER).tar.gz -rm -rf snappy-$(SNAPPY_VER) tar xvzf snappy-$(SNAPPY_VER).tar.gz mkdir snappy-$(SNAPPY_VER)/build - cd snappy-$(SNAPPY_VER)/build && CFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' CXXFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CXXFLAGS} ${EXTRA_CXXFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DSNAPPY_BUILD_BENCHMARKS=OFF -DSNAPPY_BUILD_TESTS=OFF --compile-no-warning-as-error ${PLATFORM_CMAKE_FLAGS} .. && $(MAKE) ${SNAPPY_MAKE_TARGET} + cd snappy-$(SNAPPY_VER)/build && CFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' CXXFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CXXFLAGS} ${EXTRA_CXXFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DSNAPPY_BUILD_BENCHMARKS=OFF -DSNAPPY_BUILD_TESTS=OFF ${PLATFORM_CMAKE_FLAGS} .. && $(MAKE) ${SNAPPY_MAKE_TARGET} cp snappy-$(SNAPPY_VER)/build/libsnappy.a . lz4-$(LZ4_VER).tar.gz: @@ -2372,27 +2335,27 @@ rocksdbjavastaticreleasedocker: rocksdbjavastaticosx rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86: mkdir -p java/target - docker run --rm --name rocksdb_linux_x86-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux.sh + docker run --rm --name rocksdb_linux_x86-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_x86-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockerx86_64: mkdir -p java/target - docker run --rm --name rocksdb_linux_x64-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh + docker run --rm --name rocksdb_linux_x64-be --platform linux/amd64 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_x64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockerppc64le: mkdir -p java/target - docker run --rm --name rocksdb_linux_ppc64le-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux.sh + docker run --rm --name rocksdb_linux_ppc64le-be --platform linux/ppc64le --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockerarm64v8: mkdir -p java/target - docker run --rm --name rocksdb_linux_arm64v8-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux.sh + docker run --rm --name rocksdb_linux_arm64v8-be --platform linux/aarch64 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockers390x: mkdir -p java/target - docker run --rm --name rocksdb_linux_s390x-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:ubuntu18_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux.sh + docker run --rm --name rocksdb_linux_s390x-be --platform linux/s390x --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:ubuntu18_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockerriscv64: mkdir -p java/target - docker run --rm --name rocksdb_linux_riscv64-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:ubuntu20_riscv64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh + docker run --rm --name rocksdb_linux_riscv64-be --platform linux/riscv64 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:ubuntu20_riscv64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockerx86musl: mkdir -p java/target @@ -2400,19 +2363,19 @@ rocksdbjavastaticdockerx86musl: rocksdbjavastaticdockerx86_64musl: mkdir -p java/target - docker run --rm --name rocksdb_linux_x64-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_x64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh + docker run --rm --name rocksdb_linux_x64-musl-be --platform linux/amd64 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_x64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockerppc64lemusl: mkdir -p java/target - docker run --rm --name rocksdb_linux_ppc64le-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux.sh + docker run --rm --name rocksdb_linux_ppc64le-musl-be --platform linux/ppc64le --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockerarm64v8musl: mkdir -p java/target - docker run --rm --name rocksdb_linux_arm64v8-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux.sh + docker run --rm --name rocksdb_linux_arm64v8-musl-be --platform linux/aarch64 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticdockers390xmusl: mkdir -p java/target - docker run --rm --name rocksdb_linux_s390x-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux.sh + docker run --rm --name rocksdb_linux_s390x-musl-be --platform linux/s390x --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux.sh rocksdbjavastaticpublish: rocksdbjavastaticrelease rocksdbjavastaticpublishcentral @@ -2467,8 +2430,8 @@ jtest_run: jtest: rocksdbjava cd java;$(MAKE) sample test -jpmd: rocksdbjava rocksdbjavageneratepom - cd java;$(MAKE) pmd +jpmd: rocksdbjavageneratepom + cd java;$(MAKE) java java_test pmd jdb_bench: cd java;$(MAKE) db_bench; @@ -2478,38 +2441,6 @@ commit_prereq: false # J=$(J) build_tools/precommit_checker.py unit clang_unit release clang_release tsan asan ubsan lite unit_non_shm # $(MAKE) clean && $(MAKE) jclean && $(MAKE) rocksdbjava; -# For public CI runs, checkout folly in a way that can build with RocksDB. -# This is mostly intended as a test-only simulation of Meta-internal folly -# integration. -checkout_folly: - if [ -e third-party/folly ]; then \ - cd third-party/folly && ${GIT_COMMAND} fetch origin; \ - else \ - cd third-party && ${GIT_COMMAND} clone https://github.com/facebook/folly.git; \ - fi - @# Pin to a particular version for public CI, so that PR authors don't - @# need to worry about folly breaking our integration. Update periodically - cd third-party/folly && git reset --hard 78286282478e1ae05b2e8cbcf0e2139eab283bea - @# NOTE: this hack is required for clang in some cases - perl -pi -e 's/int rv = syscall/int rv = (int)syscall/' third-party/folly/folly/detail/Futex.cpp - @# NOTE: this hack is required for gcc in some cases - perl -pi -e 's/(__has_include..)/__cpp_rtti && $$1/' third-party/folly/folly/memory/MemoryResource.h - @# NOTE: boost source will be needed for any build including `USE_FOLLY_LITE` builds as those depend on boost headers - cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py fetch boost - -CXX_M_FLAGS = $(filter -m%, $(CXXFLAGS)) - -build_folly: - FOLLY_INST_PATH=`cd third-party/folly; $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \ - if [ "$$FOLLY_INST_PATH" ]; then \ - rm -rf $${FOLLY_INST_PATH}/../../*; \ - else \ - echo "Please run checkout_folly first"; \ - false; \ - fi - cd third-party/folly && \ - CXXFLAGS=" $(CXX_M_FLAGS) -DHAVE_CXX11_ATOMIC " $(PYTHON) build/fbcode_builder/getdeps.py build --no-tests - # --------------------------------------------------------------------------- # Build size testing # --------------------------------------------------------------------------- @@ -2630,7 +2561,7 @@ list_all_tests: # Remove the rules for which dependencies should not be generated and see if any are left. #If so, include the dependencies; if not, do not include the dependency files -ROCKS_DEP_RULES=$(filter-out clean format check-format check-buck-targets check-headers check-sources jclean jtest package analyze tags rocksdbjavastatic% unity.% unity_test checkout_folly, $(MAKECMDGOALS)) +ROCKS_DEP_RULES=$(filter-out clean format check-format check-buck-targets check-headers check-sources clang-tidy jclean jtest package analyze tags rocksdbjavastatic% unity.% unity_test checkout_folly, $(MAKECMDGOALS)) ifneq ("$(ROCKS_DEP_RULES)", "") -include $(DEPFILES) endif diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py index 035254b5ad1f..647353e44f3c 100755 --- a/buckifier/buckify_rocksdb.py +++ b/buckifier/buckify_rocksdb.py @@ -135,6 +135,9 @@ def generate_buck(repo_path, deps_map): BUCK = TARGETSBuilder("%s/BUCK" % repo_path, extra_argv) + # Add oncall("rocksdb_point_of_contact") at the top + BUCK.add_oncall("rocksdb_point_of_contact") + # rocksdb_lib BUCK.add_library( "rocksdb_lib", @@ -206,6 +209,12 @@ def generate_buck(repo_path, deps_map): src_mk.get("CACHE_BENCH_LIB_SOURCES", []), [":rocksdb_lib"], ) + # rocksdb_point_lock_bench_tools_lib + BUCK.add_library( + "rocksdb_point_lock_bench_tools_lib", + src_mk.get("POINT_LOCK_BENCH_LIB_SOURCES", []), + [":rocksdb_lib"], + ) # rocksdb_stress_lib BUCK.add_rocksdb_library( "rocksdb_stress_lib", @@ -229,6 +238,12 @@ def generate_buck(repo_path, deps_map): BUCK.add_binary( "cache_bench", ["cache/cache_bench.cc"], [":rocksdb_cache_bench_tools_lib"] ) + # point_lock_bench binary + BUCK.add_binary( + "point_lock_bench", + ["utilities/transactions/lock/point/point_lock_bench.cc"], + [":rocksdb_point_lock_bench_tools_lib"] + ) # bench binaries for src in src_mk.get("MICROBENCH_SOURCES", []): name = src.rsplit("/", 1)[1].split(".")[0] if "/" in src else src.split(".")[0] diff --git a/buckifier/targets_builder.py b/buckifier/targets_builder.py index e62eaf958504..1f0f412e18e3 100644 --- a/buckifier/targets_builder.py +++ b/buckifier/targets_builder.py @@ -45,6 +45,11 @@ def __init__(self, path, extra_argv): self.total_bin = 0 self.total_test = 0 self.tests_cfg = "" + + def add_oncall(self, oncall): + with open(self.path, "ab") as targets_file: + targets_file.write(targets_cfg.oncall_template.format(name=oncall).encode("utf-8")) + def add_library( self, diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 4e58d1210200..e9ff129a604a 100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -1,10 +1,12 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This source code is licensed under both the GPLv2 (found in the COPYING file in the root directory) +# and the Apache 2.0 License (found in the LICENSE.Apache file in the root directory). rocksdb_target_header_template = """# This file \100generated by: #$ python3 buckifier/buckify_rocksdb.py{extra_argv} # --> DO NOT EDIT MANUALLY <-- -# This file is a Facebook-specific integration for buck builds, so can -# only be validated by Facebook employees. +# This file is a Meta-specific integration for buck builds, so can +# only be validated by Meta employees. load("//rocks/buckifier:defs.bzl", "cpp_library_wrapper","rocks_cpp_library_wrapper","cpp_binary_wrapper","cpp_unittest_wrapper","fancy_bench_wrapper","add_c_test_wrapper") load("@fbcode_macros//build_defs:export_files.bzl", "export_file") @@ -41,3 +43,8 @@ export_file_template = """ export_file(name = "{name}") """ + + +oncall_template = """ +oncall("{name}") +""" diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 629b670b43d6..cfb8d143664b 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -45,18 +45,21 @@ if test -z "$OUTPUT"; then exit 1 fi -# we depend on C++17, but should be compatible with newer standards +# we depend on C++20, but should be compatible with newer standards if [ "$ROCKSDB_CXX_STANDARD" ]; then PLATFORM_CXXFLAGS="-std=$ROCKSDB_CXX_STANDARD" else - PLATFORM_CXXFLAGS="-std=c++17" + PLATFORM_CXXFLAGS="-std=c++20" fi # we currently depend on POSIX platform COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX" -# Default to fbcode gcc on internal fb machines -if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then +# Default to fbcode gcc on Meta internal machines +IS_META_HOST="$(hostname | grep -E '(facebook|meta).com|fbinfra.net')" +if [ -z "$ROCKSDB_NO_FBCODE" -a "$IS_META_HOST" ]; then + if [ -d /mnt/gvfs/third-party ]; then + echo "NOTE: Using fbcode build" >&2 FBCODE_BUILD="true" # If we're compiling with TSAN or shared lib, we need pic build PIC_BUILD=$COMPILE_WITH_TSAN @@ -64,6 +67,11 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then PIC_BUILD=1 fi source "$PWD/build_tools/fbcode_config_platform010.sh" + else + echo "************************************************************************" >&2 + echo "WARNING: -d /mnt/gvfs/third-party failed; no fbcode build" >&2 + echo "************************************************************************" >&2 + fi fi # Delete existing output, if it exists @@ -71,7 +79,9 @@ rm -f "$OUTPUT" touch "$OUTPUT" if test -z "$CC"; then - if [ -x "$(command -v cc)" ]; then + if [ "$USE_CLANG" -a -x "$(command -v clang)" ]; then + CC=clang + elif [ -x "$(command -v cc)" ]; then CC=cc elif [ -x "$(command -v clang)" ]; then CC=clang @@ -81,7 +91,9 @@ if test -z "$CC"; then fi if test -z "$CXX"; then - if [ -x "$(command -v g++)" ]; then + if [ "$USE_CLANG" -a -x "$(command -v clang++)" ]; then + CXX=clang++ + elif [ -x "$(command -v g++)" ]; then CXX=g++ elif [ -x "$(command -v clang++)" ]; then CXX=clang++ @@ -91,7 +103,9 @@ if test -z "$CXX"; then fi if test -z "$AR"; then - if [ -x "$(command -v gcc-ar)" ]; then + if [ "$USE_CLANG" -a -x "$(command -v llvm-ar)" ]; then + AR=llvm-ar + elif [ -x "$(command -v gcc-ar)" ]; then AR=gcc-ar elif [ -x "$(command -v llvm-ar)" ]; then AR=llvm-ar @@ -297,7 +311,8 @@ EOF EOF then COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" + # Hack: don't link extra gflags assuming it comes with folly + [ "$USE_FOLLY" ] || PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" # check if namespace is gflags elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF #include @@ -306,7 +321,8 @@ EOF EOF then COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1 -DGFLAGS_NAMESPACE=gflags" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" + # Hack: don't link extra gflags assuming it comes with folly + [ "$USE_FOLLY" ] || PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" # check if namespace is google elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF #include @@ -758,6 +774,7 @@ fi if [ "$USE_FOLLY_LITE" ]; then if [ "$FOLLY_DIR" ]; then BOOST_SOURCE_PATH=`cd $FOLLY_DIR && $PYTHON build/fbcode_builder/getdeps.py show-source-dir boost` + FMT_SOURCE_PATH=`cd $FOLLY_DIR && $PYTHON build/fbcode_builder/getdeps.py show-source-dir fmt` fi fi @@ -802,6 +819,7 @@ echo "FIND=$FIND" >> "$OUTPUT" echo "WATCH=$WATCH" >> "$OUTPUT" echo "FOLLY_PATH=$FOLLY_PATH" >> "$OUTPUT" echo "BOOST_SOURCE_PATH=$BOOST_SOURCE_PATH" >> "$OUTPUT" +echo "FMT_SOURCE_PATH=$FMT_SOURCE_PATH" >> "$OUTPUT" # This will enable some related identifiers for the preprocessor if test -n "$JEMALLOC"; then @@ -813,7 +831,6 @@ fi if test -n "$WITH_JEMALLOC_FLAG"; then echo "WITH_JEMALLOC_FLAG=$WITH_JEMALLOC_FLAG" >> "$OUTPUT" fi -echo "LUA_PATH=$LUA_PATH" >> "$OUTPUT" if test -n "$USE_FOLLY"; then echo "USE_FOLLY=$USE_FOLLY" >> "$OUTPUT" fi diff --git a/build_tools/check-public-header.sh b/build_tools/check-public-header.sh new file mode 100755 index 000000000000..bb1bc147dc0a --- /dev/null +++ b/build_tools/check-public-header.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. +# +# Check for some simple mistakes in public headers (on the command line) +# that should prevent commit or push + +BAD="" + +# Look for potential for ODR violations caused by public headers depending on +# build parameters that could vary between RocksDB build and application build. +# * Cases like ROCKSDB_NAMESPACE, and ROCKSDB_ASSERT_STATUS_CHECKED are +# intentional, hard to avoid. (We expect definitions to change and the user +# should also.) +# * Cases like _WIN32, OS_WIN, and __cplusplus are essentially ODR-safe. +# * Cases like +# #ifdef BLAH // ODR-SAFE +# #undef BLAH +# #endif +# that should not cause ODR violations can be exempted with the ODR-SAFE +# marker recognized here. + +grep -nHE '^#if' -- "$@" | grep -vE 'ROCKSDB_NAMESPACE|ROCKSDB_ASSERT_STATUS_CHECKED|_WIN32|OS_WIN|ODR-SAFE|__cplusplus|ROCKSDB_DLL|ROCKSDB_LIBRARY_EXPORTS' +if [ "$?" != "1" ]; then + echo "^^^^^ #if in public API could cause an ODR violation." + echo " Add // ODR-SAFE if verified safe." + BAD=1 +fi + +if [ "$BAD" ]; then + exit 1 +fi diff --git a/build_tools/check_progress.sh b/build_tools/check_progress.sh new file mode 100755 index 000000000000..d52a91dabd05 --- /dev/null +++ b/build_tools/check_progress.sh @@ -0,0 +1,231 @@ +#!/bin/bash +# Output test progress in JSON format for machine parsing +# Usage: build_tools/check_progress.sh + +LOG_FILE="LOG" +T_DIR="t" +SRC_MK="src.mk" + +# Maximum lines of test output to include per failed test +MAX_OUTPUT_LINES=50 + +# Helper to escape string for JSON (handles newlines, quotes, backslashes, tabs) +json_escape() { + local str="$1" + # Use python for reliable JSON escaping if available, otherwise use sed + if command -v python3 &>/dev/null; then + printf '%s' "$str" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read())[1:-1], end="")' + else + printf '%s' "$str" | sed 's/\\/\\\\/g; s/"/\\"/g; s/\t/\\t/g; s/\r/\\r/g' | awk '{printf "%s\\n", $0}' | sed 's/\\n$//' + fi +} + +# Helper to output JSON and exit +output_json() { + local status="$1" + local completed="${2:-0}" + local total="${3:-0}" + local failed="${4:-0}" + local percent="${5:-0}" + local eta="${6:-0}" + local avg_time="${7:-0}" + local last_item="${8:-}" + local phase="${9:-}" + local failed_tests="${10:-}" + + # Build JSON output + local json="{\"status\":\"$status\"" + + if [[ -n "$phase" ]]; then + json="$json,\"phase\":\"$phase\"" + fi + + json="$json,\"completed\":$completed,\"total\":$total,\"failed\":$failed,\"percent\":$percent" + json="$json,\"eta_seconds\":$eta,\"avg_time\":\"$avg_time\",\"last_item\":\"$(json_escape "$last_item")\"" + + if [[ -n "$failed_tests" ]]; then + json="$json,\"failed_tests\":[$failed_tests]" + fi + + json="$json}" + echo "$json" +} + +# Get failed test info with log output +get_failed_tests_json() { + local log_file="$1" + local t_dir="$2" + local max_failures=10 + local count=0 + local first=true + + # Get failed tests from LOG file + while IFS=$'\t' read -r seq host starttime runtime send recv exitval signal cmd; do + # Skip header line + [[ "$seq" == "Seq" ]] && continue + + # Check if failed (exitval != 0 or signal != 0) + if [[ "$exitval" != "0" || "$signal" != "0" ]]; then + # Extract test name from command + test_name=$(echo "$cmd" | sed 's,.*/run-,,;s, .*,,') + + # Get log file path + log_path="$t_dir/log-run-$test_name" + + # Read test output (last N lines) + if [[ -f "$log_path" ]]; then + output=$(tail -n "$MAX_OUTPUT_LINES" "$log_path" 2>/dev/null) + else + output="(log file not found: $log_path)" + fi + + # Escape output for JSON + escaped_output=$(json_escape "$output") + + # Build JSON object for this failure + if [[ "$first" == "true" ]]; then + first=false + else + printf "," + fi + printf '{"test":"%s","exit_code":%d,"signal":%d,"output":"%s"}' \ + "$test_name" "$exitval" "$signal" "$escaped_output" + + ((count++)) + if [[ $count -ge $max_failures ]]; then + break + fi + fi + done < "$log_file" +} + +# Check if tests are running (LOG file exists) +if [[ -f "$LOG_FILE" ]]; then + # Count total tests from t/run-* files + if [[ -d "$T_DIR" ]]; then + total=$(find "$T_DIR" -name 'run-*' -type f 2>/dev/null | wc -l) + else + total=0 + fi + + # If no parallel tests generated yet + if [[ "$total" -eq 0 ]]; then + output_json "running" 0 0 0 0 0 "0" "" "generating" + exit 0 + fi + + # Parse LOG file (skip header line) + # LOG format: Seq Host Starttime JobRuntime Send Receive Exitval Signal Command + completed=$(tail -n +2 "$LOG_FILE" 2>/dev/null | wc -l) + + # Count failures + failed=$(awk -F'\t' 'NR>1 && ($7 != 0 || $8 != 0) {count++} END {print count+0}' "$LOG_FILE" 2>/dev/null) + + # Get failed tests JSON with output (only if there are failures) + if [[ "$failed" -gt 0 ]]; then + failed_tests=$(get_failed_tests_json "$LOG_FILE" "$T_DIR") + else + failed_tests="" + fi + + # Calculate percentage + if [[ "$total" -gt 0 ]]; then + percent=$((completed * 100 / total)) + else + percent=0 + fi + + # Get last completed test name (extract from command column) + last_test=$(tail -1 "$LOG_FILE" 2>/dev/null | awk -F'\t' '{print $9}' | sed 's,.*/run-,,;s, .*,,;s,^./,,') + + # Calculate ETA based on average time + if [[ "$completed" -gt 0 ]]; then + avg_time=$(awk -F'\t' 'NR>1 {sum+=$4; count++} END {if(count>0) printf "%.1f", sum/count; else print "0"}' "$LOG_FILE") + remaining=$((total - completed)) + eta=$(awk "BEGIN {printf \"%.0f\", $avg_time * $remaining}") + else + avg_time="0" + eta="0" + fi + + # Determine status + if [[ "$completed" -ge "$total" ]]; then + status="completed" + elif [[ "$completed" -gt 0 ]]; then + status="running" + else + status="starting" + fi + + output_json "$status" "$completed" "$total" "$failed" "$percent" "$eta" "$avg_time" "$last_test" "testing" "$failed_tests" + exit 0 +fi + +# No LOG file - check if we're in compilation/linking phase +# Count expected source files from src.mk +if [[ -f "$SRC_MK" ]]; then + # Count LIB_SOURCES (library object files to compile) + expected_lib_objects=$(grep -E '\.cc\s*\\?$' "$SRC_MK" | grep -v '^#' | wc -l) + + # Count TEST_MAIN_SOURCES (test binaries to link) + expected_test_binaries=$(sed -n '/^TEST_MAIN_SOURCES =/,/^[^ ]/p' "$SRC_MK" | grep -cE '\.cc\s*\\?$' 2>/dev/null || echo 0) +else + expected_lib_objects=0 + expected_test_binaries=0 +fi + +# Check for test generation phase (t/ directory being created) +if [[ -d "$T_DIR" ]]; then + total=$(find "$T_DIR" -name 'run-*' -type f 2>/dev/null | wc -l) + if [[ "$total" -gt 0 ]]; then + output_json "running" 0 "$total" 0 0 0 "0" "" "generating" + exit 0 + fi +fi + +# Count compiled object files (in subdirectories matching source structure) +# Object files are created as dir/file.o (e.g., cache/cache.o, db/db_impl.o) +compiled_objects=0 +if [[ "$expected_lib_objects" -gt 0 ]]; then + # Count .o files in source directories + compiled_objects=$(find cache db env file logging memory memtable monitoring options port table test_util trace_replay util utilities -name '*.o' -type f 2>/dev/null | wc -l) +fi + +# Count linked test binaries (test binaries are in current directory with _test suffix) +linked_tests=0 +if [[ "$expected_test_binaries" -gt 0 ]]; then + linked_tests=$(find . -maxdepth 1 -name '*_test' -type f -executable 2>/dev/null | wc -l) +fi + +# Determine phase based on what exists +if [[ "$compiled_objects" -eq 0 && "$linked_tests" -eq 0 ]]; then + # Nothing compiled yet - not started or just beginning + output_json "not_started" 0 0 0 0 0 "0" "" + exit 0 +fi + +# Calculate total work units: compiling + linking +total_work=$((expected_lib_objects + expected_test_binaries)) +completed_work=$((compiled_objects + linked_tests)) + +if [[ "$total_work" -gt 0 ]]; then + percent=$((completed_work * 100 / total_work)) +else + percent=0 +fi + +# Determine phase +if [[ "$compiled_objects" -lt "$expected_lib_objects" ]]; then + phase="compiling" + # Get most recently modified .o file as last_item + last_item=$(find cache db env file logging memory memtable monitoring options port table test_util trace_replay util utilities -name '*.o' -type f -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2- | sed 's,^\./,,;s,\.o$,,') +elif [[ "$linked_tests" -lt "$expected_test_binaries" ]]; then + phase="linking" + # Get most recently modified test binary as last_item + last_item=$(find . -maxdepth 1 -name '*_test' -type f -executable -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2- | sed 's,^\./,,') +else + phase="generating" + last_item="" +fi + +output_json "running" "$completed_work" "$total_work" 0 "$percent" 0 "0" "$last_item" "$phase" diff --git a/build_tools/dependencies_platform010.sh b/build_tools/dependencies_platform010.sh index 9b19a801c85f..a55663cb25da 100644 --- a/build_tools/dependencies_platform010.sh +++ b/build_tools/dependencies_platform010.sh @@ -19,4 +19,3 @@ BENCHMARK_BASE=/mnt/gvfs/third-party2/benchmark/780c7a0f9cf0967961e69ad08e61cddd KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/624a2f8f6c93c3c1df8aa4a6255d8202631a6c80/fb/platform010/da39a3e BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/39579e8603b48b3540f8b0633f43adf29acccb8b/2.37/centos8-native/da39a3e VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/cd9cc656d49ecb53797ce4d055e49fde29fd57ff/3.19.0/platform010/76ebdda -LUA_BASE=/mnt/gvfs/third-party2/lua/363787fa5cac2a8aa20638909210443278fa138e/5.3.4/platform010/9079c97 diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh index 02732bde3d1c..802e757795c7 100644 --- a/build_tools/fbcode_config.sh +++ b/build_tools/fbcode_config.sh @@ -164,12 +164,4 @@ EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GF VALGRIND_VER="$VALGRIND_BASE/bin/" -LUA_PATH="$LUA_BASE" - -if test -z $PIC_BUILD; then - LUA_LIB=" $LUA_PATH/lib/liblua.a" -else - LUA_LIB=" $LUA_PATH/lib/liblua_pic.a" -fi - -export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB +export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD diff --git a/build_tools/fbcode_config_platform010.sh b/build_tools/fbcode_config_platform010.sh index 87a28b4f92d0..0fc99ecad159 100644 --- a/build_tools/fbcode_config_platform010.sh +++ b/build_tools/fbcode_config_platform010.sh @@ -172,4 +172,4 @@ EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GF VALGRIND_VER="$VALGRIND_BASE/bin/" -export CC CXX AR AS CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB +export CC CXX AR AS CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh index 9dc85496c91b..55ee4bd6e24f 100755 --- a/build_tools/format-diff.sh +++ b/build_tools/format-diff.sh @@ -7,14 +7,18 @@ print_usage () { echo "Usage:" echo "format-diff.sh [OPTIONS]" echo "-c: check only." + echo "-y: auto-apply formatting without prompts (non-interactive mode)." echo "-h: print this message." } -while getopts ':ch' OPTION; do +while getopts ':cyh' OPTION; do case "$OPTION" in c) CHECK_ONLY=1 ;; + y) + AUTO_APPLY=1 + ;; h) print_usage exit 1 @@ -118,6 +122,9 @@ fi # fi set -e +# Exclude third-party from formatting +EXCLUDE=':!third-party/' + uncommitted_code=`git diff HEAD` # If there's no uncommitted changes, we assume user are doing post-commit @@ -137,14 +144,78 @@ then # should be relevant for formatting fixes. FORMAT_UPSTREAM_MERGE_BASE="$(git merge-base "$FORMAT_UPSTREAM" HEAD)" # Get the differences - diffs=$(git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -p 1) || true + diffs=$(git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" -- $EXCLUDE | $CLANG_FORMAT_DIFF -p 1) || true echo "Checking format of changes not yet in $FORMAT_UPSTREAM..." else # Check the format of uncommitted lines, - diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1) || true + diffs=$(git diff -U0 HEAD -- $EXCLUDE | $CLANG_FORMAT_DIFF -p 1) || true echo "Checking format of uncommitted changes..." fi +# Check for missing copyright in new files +echo "Checking for copyright headers in new files..." + +# Get list of new files (added, not just modified) +if [ -z "$uncommitted_code" ]; then + # Post-commit: check files added since merge base + new_files=$(git diff --name-only --diff-filter=A "$FORMAT_UPSTREAM_MERGE_BASE" -- '*.h' '*.cc' '*.py' $EXCLUDE) +else + # Pre-commit: check staged new files + new_files=$(git diff --name-only --diff-filter=A --cached HEAD -- '*.h' '*.cc' '*.py' $EXCLUDE) +fi + +if [ -n "$new_files" ]; then + files_missing_copyright="" + + for file in $new_files; do + if [ -f "$file" ]; then + # Check if file is missing copyright + # For .py files, check for Python-style comment + # For .h and .cc files, check for C++-style comment + if [[ "$file" == *.py ]]; then + if ! grep -q "Copyright (c) Meta Platforms, Inc. and affiliates" "$file"; then + files_missing_copyright="$files_missing_copyright $file" + # Add copyright header to Python file + temp_file=$(mktemp) + { + echo "# Copyright (c) Meta Platforms, Inc. and affiliates." + echo "# This source code is licensed under both the GPLv2 (found in the COPYING file in the root directory)" + echo "# and the Apache 2.0 License (found in the LICENSE.Apache file in the root directory)." + echo + cat "$file" + } > "$temp_file" + mv "$temp_file" "$file" + echo "Added copyright header to $file" + fi + elif [[ "$file" == *.h ]] || [[ "$file" == *.cc ]]; then + if ! grep -q "Copyright (c) Meta Platforms, Inc. and affiliates" "$file"; then + files_missing_copyright="$files_missing_copyright $file" + # Add copyright header to C++ file + temp_file=$(mktemp) + { + echo "// Copyright (c) Meta Platforms, Inc. and affiliates. " + echo "// This source code is licensed under both the GPLv2 (found in the " + echo "// COPYING file in the root directory) and Apache 2.0 License " + echo "// (found in the LICENSE.Apache file in the root directory)." + echo + cat "$file" + } > "$temp_file" + mv "$temp_file" "$file" + echo "Added copyright header to $file" + fi + fi + fi + done + + if [ -n "$files_missing_copyright" ]; then + echo "Copyright headers were added to new files." + else + echo "All new files have copyright headers." + fi +else + echo "No new files to check for copyright headers." +fi + if [ -z "$diffs" ] then echo "Nothing needs to be reformatted!" @@ -173,11 +244,16 @@ echo "$diffs" | sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" | sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/" -echo -e "Would you like to fix the format automatically (y/n): \c" +# Handle auto-apply mode (non-interactive) +if [ "$AUTO_APPLY" ]; then + to_fix="y" +else + echo -e "Would you like to fix the format automatically (y/n): \c" -# Make sure under any mode, we can read user input. -exec < /dev/tty -read to_fix + # Make sure under any mode, we can read user input. + exec < /dev/tty + read to_fix +fi if [ "$to_fix" != "y" ] then @@ -187,14 +263,15 @@ fi # Do in-place format adjustment. if [ -z "$uncommitted_code" ] then - git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -i -p 1 + git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" -- $EXCLUDE | $CLANG_FORMAT_DIFF -i -p 1 else - git diff -U0 HEAD | $CLANG_FORMAT_DIFF -i -p 1 + git diff -U0 HEAD -- $EXCLUDE | $CLANG_FORMAT_DIFF -i -p 1 fi echo "Files reformatted!" # Amend to last commit if user do the post-commit format check -if [ -z "$uncommitted_code" ]; then +# Skip amend prompt in auto-apply mode (user can amend manually if desired) +if [ -z "$uncommitted_code" ] && [ -z "$AUTO_APPLY" ]; then echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c" read to_amend diff --git a/build_tools/getdeps_fallback_mirror.py b/build_tools/getdeps_fallback_mirror.py new file mode 100644 index 000000000000..7b3bb31b584d --- /dev/null +++ b/build_tools/getdeps_fallback_mirror.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +""" +Pre-download packages with unreliable mirrors using fallback mirrors. +Reads package info from folly's getdeps manifest files. +""" +import sys +import os +import hashlib +import subprocess +import configparser + +def sha256_file(path): + """Calculate SHA256 hash of a file.""" + h = hashlib.sha256() + try: + with open(path, 'rb') as f: + for chunk in iter(lambda: f.read(65536), b''): + h.update(chunk) + return h.hexdigest() + except Exception: + return None + +def parse_manifest(manifest_path): + """Parse a getdeps manifest file to extract download info.""" + config = configparser.ConfigParser() + try: + config.read(manifest_path) + if 'download' in config: + return { + 'url': config['download'].get('url', ''), + 'sha256': config['download'].get('sha256', ''), + } + except Exception: + pass + return None + +def get_fallback_mirrors(url): + """Get fallback mirror URLs for a given URL.""" + # Fallback mirror patterns for known unreliable hosts + mirror_fallbacks = { + "ftp.gnu.org/gnu/": [ + "https://mirrors.kernel.org/gnu/", + "https://ftpmirror.gnu.org/gnu/", + "https://ftp.gnu.org/gnu/", + ], + "ftpmirror.gnu.org/gnu/": [ + "https://mirrors.kernel.org/gnu/", + "https://ftpmirror.gnu.org/gnu/", + "https://ftp.gnu.org/gnu/", + ], + } + + for pattern, mirrors in mirror_fallbacks.items(): + if pattern in url: + # Extract the path after the pattern + path_start = url.find(pattern) + len(pattern) + path = url[path_start:] + return [mirror + path for mirror in mirrors] + return [url] # No fallback, use original + +def main(): + if len(sys.argv) != 4: + print(f"Usage: {sys.argv[0]} ") + sys.exit(1) + + download_dir, cache_dir, manifests_dir = sys.argv[1], sys.argv[2], sys.argv[3] + + # Packages known to have unreliable mirrors + packages_to_check = ["autoconf", "automake", "libtool"] + + for package in packages_to_check: + manifest_path = os.path.join(manifests_dir, package) + if not os.path.exists(manifest_path): + continue + + info = parse_manifest(manifest_path) + if not info or not info['url'] or not info['sha256']: + continue + + # Determine filename from URL + url = info['url'] + expected_sha256 = info['sha256'] + url_filename = os.path.basename(url) + + # getdeps uses format: {package}-{filename} + filename = f"{package}-{url_filename}" + filepath = os.path.join(download_dir, filename) + cache_path = os.path.join(cache_dir, filename) + + # Check if already valid + if os.path.exists(filepath) and sha256_file(filepath) == expected_sha256: + print(f" {filename}: OK (already downloaded)") + continue + + # Check cache + if os.path.exists(cache_path) and sha256_file(cache_path) == expected_sha256: + print(f" {filename}: OK (from cache)") + subprocess.run(['cp', cache_path, filepath], check=True) + continue + + # Try fallback mirrors + mirrors = get_fallback_mirrors(url) + downloaded = False + for mirror_url in mirrors: + print(f" {filename}: trying {mirror_url}...") + try: + subprocess.run(['wget', '-q', '-O', filepath, mirror_url], check=True, timeout=120) + if sha256_file(filepath) == expected_sha256: + print(f" {filename}: OK (downloaded)") + subprocess.run(['cp', filepath, cache_path], check=False) + downloaded = True + break + else: + os.remove(filepath) + except Exception: + if os.path.exists(filepath): + os.remove(filepath) + + if not downloaded: + print(f" {filename}: WARNING - all mirrors failed") + +if __name__ == "__main__": + main() diff --git a/build_tools/ubuntu22_image/Dockerfile b/build_tools/ubuntu22_image/Dockerfile new file mode 100644 index 000000000000..cb627f33daa7 --- /dev/null +++ b/build_tools/ubuntu22_image/Dockerfile @@ -0,0 +1,88 @@ +# INSTRUCTIONS: +# I was not able to build docker images on an isolated devserver because of +# issues with proxy internet access. Use a public cloud or other Linux system. +# (I used a Debian system after installing docker features, adding my user to +# the docker and docker-registry groups, and logging out and back in to pick +# those up.) +# +# Follow https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry#authenticating-with-a-personal-access-token-classic +# to login with your GitHub credentials, as in +# +# $ docker login ghcr.io -u pdillinger +# +# and paste the limited-purpose GitHub token into the terminal. +# +# Then in the build_tools/ubuntu22_image directory, (bump minor version for +# random docker file updates, major version tracks Ubuntu release) +# +# $ docker build -t ghcr.io/facebook/rocksdb_ubuntu:22.0 +# $ docker push ghcr.io/facebook/rocksdb_ubuntu:22.0 +# +# Might need to change visibility to public through +# https://github.com/orgs/facebook/packages/container/rocksdb_ubuntu/settings +# or similar. + +# from official ubuntu 22.04 +FROM ubuntu:22.04 +# update system +RUN apt-get update +RUN apt-get upgrade -y +# install basic tools +RUN apt-get install -y vim wget curl +# install tzdata noninteractive +RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata +# install git and default compilers +RUN apt-get install -y git gcc g++ clang clang-tools +# install basic package +RUN apt-get install -y lsb-release software-properties-common gnupg +# install gflags, tbb +RUN apt-get install -y libgflags-dev libtbb-dev +# install compression libs +RUN apt-get install -y libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev +# install cmake +RUN apt-get install -y cmake +RUN apt-get install -y libssl-dev +# install clang-13 +WORKDIR /root +RUN wget https://apt.llvm.org/llvm.sh +RUN chmod +x llvm.sh +RUN ./llvm.sh 13 all +# There are incompatibilities between clang with -std=c++20 and libstdc++ +# provided by gcc, so we have to compile with clang-13 using -stdlib=libc++ +# and only one version of libc++ can be installed on the system at one time. +# So to avoid confusion we remove unusable clang-14 also. +RUN apt-get install libc++-13-dev libc++abi-13-dev +RUN apt-get purge -y clang-14 && apt-get autoremove -y + +# install gcc-10 and more, default is 11 +RUN apt-get install -y gcc-10 g++-10 +RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test +RUN apt-get install -y gcc-13 g++-13 +# install apt-get install -y valgrind +RUN apt-get install -y valgrind +# install folly depencencies +# Missing compatible libunwind: RUN apt-get install -y libgoogle-glog-dev +# So instead install from source. This currently requires compiling with +# -DGLOG_USE_GLOG_EXPORT +RUN wget https://github.com/google/glog/archive/refs/tags/v0.7.1.tar.gz && tar xzf v0.7.1.tar.gz && cd glog-0.7.1/ && cmake -S . -B build -G "Unix Makefiles" && cmake --build build && cmake --build build --target install && cd .. && rm -rf v0.7.1.tar.gz glog-0.7.1 +# install openjdk 8 +RUN apt-get install -y openjdk-8-jdk +ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64 +# install mingw +RUN apt-get install -y mingw-w64 + +# install gtest-parallel package +RUN git clone --single-branch --branch master --depth 1 https://github.com/google/gtest-parallel.git ~/gtest-parallel +ENV PATH $PATH:/root/gtest-parallel + +# install libprotobuf for fuzzers test +RUN apt-get install -y ninja-build binutils liblzma-dev libz-dev pkg-config autoconf libtool +RUN git clone --branch v1.0 https://github.com/google/libprotobuf-mutator.git ~/libprotobuf-mutator && cd ~/libprotobuf-mutator && git checkout ffd86a32874e5c08a143019aad1aaf0907294c9f && mkdir build && cd build && cmake .. -GNinja -DCMAKE_C_COMPILER=clang-13 -DCMAKE_CXX_COMPILER=clang++-13 -DCMAKE_BUILD_TYPE=Release -DLIB_PROTO_MUTATOR_DOWNLOAD_PROTOBUF=ON && ninja && ninja install +ENV PKG_CONFIG_PATH /usr/local/OFF/:/root/libprotobuf-mutator/build/external.protobuf/lib/pkgconfig/ +ENV PROTOC_BIN /root/libprotobuf-mutator/build/external.protobuf/bin/protoc + +# install the latest google benchmark +RUN git clone --depth 1 --branch v1.7.0 https://github.com/google/benchmark.git ~/benchmark && cd ~/benchmark && mkdir build && cd build && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 && ninja && ninja install && cd ~ && rm -rf /root/benchmark + +# clean up +RUN rm -rf /var/lib/apt/lists/* diff --git a/build_tools/ubuntu24_image/Dockerfile b/build_tools/ubuntu24_image/Dockerfile new file mode 100644 index 000000000000..0f7e98ca6e9f --- /dev/null +++ b/build_tools/ubuntu24_image/Dockerfile @@ -0,0 +1,72 @@ +# INSTRUCTIONS: +# I was not able to build docker images on an isolated devserver because of +# issues with proxy internet access. Use a public cloud or other Linux system. +# (I used a Debian system after installing docker features, adding my user to +# the docker and docker-registry groups, and logging out and back in to pick +# those up.) +# +# Follow https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry#authenticating-with-a-personal-access-token-classic +# to login with your GitHub credentials, as in +# +# $ docker login ghcr.io -u pdillinger +# +# and paste the limited-purpose GitHub token into the terminal. +# +# Then in the build_tools/ubuntu24_image directory, (bump minor version for +# random docker file updates, major version tracks Ubuntu release) +# +# $ docker build -t ghcr.io/facebook/rocksdb_ubuntu:24.0 +# $ docker push ghcr.io/facebook/rocksdb_ubuntu:24.0 +# +# Might need to change visibility to public through +# https://github.com/orgs/facebook/packages/container/rocksdb_ubuntu/settings +# or similar. + +# from official ubuntu 24.04 +FROM ubuntu:24.04 +# update system +RUN apt-get update +RUN apt-get upgrade -y +# install basic tools +RUN apt-get install -y vim wget curl +# install tzdata noninteractive +RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata +# install git and default compilers +RUN apt-get install -y git gcc g++ clang clang-tools +# install basic package +RUN apt-get install -y lsb-release software-properties-common gnupg +# install gflags, tbb +RUN apt-get install -y libgflags-dev libtbb-dev +# install compression libs +RUN apt-get install -y libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev +# install cmake +RUN apt-get install -y cmake +RUN apt-get install -y libssl-dev + +# install gcc-12 and more, default is 13 +RUN apt-get install -y gcc-12 g++-12 gcc-14 g++-14 +# install apt-get install -y valgrind +RUN apt-get install -y valgrind +# install folly depencencies +RUN apt-get install -y libgoogle-glog-dev +# install openjdk 8 +RUN apt-get install -y openjdk-8-jdk +ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64 +# install mingw +RUN apt-get install -y mingw-w64 + +# install gtest-parallel package +RUN git clone --single-branch --branch master --depth 1 https://github.com/google/gtest-parallel.git ~/gtest-parallel +ENV PATH $PATH:/root/gtest-parallel + +# install libprotobuf for fuzzers test +RUN apt-get install -y ninja-build binutils liblzma-dev libz-dev pkg-config autoconf libtool +RUN git clone --branch v1.0 https://github.com/google/libprotobuf-mutator.git ~/libprotobuf-mutator && cd ~/libprotobuf-mutator && git checkout ffd86a32874e5c08a143019aad1aaf0907294c9f && mkdir build && cd build && cmake .. -GNinja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DLIB_PROTO_MUTATOR_DOWNLOAD_PROTOBUF=ON && ninja && ninja install +ENV PKG_CONFIG_PATH /usr/local/OFF/:/root/libprotobuf-mutator/build/external.protobuf/lib/pkgconfig/ +ENV PROTOC_BIN /root/libprotobuf-mutator/build/external.protobuf/bin/protoc + +# install the latest google benchmark +RUN git clone --depth 1 --branch v1.7.0 https://github.com/google/benchmark.git ~/benchmark && cd ~/benchmark && mkdir build && cd build && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 && ninja && ninja install && cd ~ && rm -rf /root/benchmark + +# clean up +RUN rm -rf /var/lib/apt/lists/* diff --git a/build_tools/update_dependencies.sh b/build_tools/update_dependencies.sh index afc39ab8009a..6584cd6edaca 100755 --- a/build_tools/update_dependencies.sh +++ b/build_tools/update_dependencies.sh @@ -101,6 +101,5 @@ get_lib_base benchmark LATEST platform010 get_lib_base kernel-headers fb platform010 get_lib_base binutils LATEST centos8-native get_lib_base valgrind LATEST platform010 -get_lib_base lua 5.3.4 platform010 git diff $OUTPUT diff --git a/cache/cache.cc b/cache/cache.cc index 3556f61243e9..f94a379d200c 100644 --- a/cache/cache.cc +++ b/cache/cache.cc @@ -54,11 +54,6 @@ static std::unordered_map {offsetof(struct CompressedSecondaryCacheOptions, compression_type), OptionType::kCompressionType, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, - {"compress_format_version", - {offsetof(struct CompressedSecondaryCacheOptions, - compress_format_version), - OptionType::kUInt32T, OptionVerificationType::kNormal, - OptionTypeFlags::kMutable}}, {"enable_custom_split_merge", {offsetof(struct CompressedSecondaryCacheOptions, enable_custom_split_merge), diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc index a5e589f4689f..7b62fbae662a 100644 --- a/cache/cache_bench_tool.cc +++ b/cache/cache_bench_tool.cc @@ -60,6 +60,8 @@ DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added."); DEFINE_uint32(value_bytes_estimate, 0, "If > 0, overrides estimated_entry_charge or " "min_avg_entry_charge depending on cache_type."); +DEFINE_double(compressible_to_ratio, 0.5, + "Approximate size ratio that values can be compressed to."); DEFINE_int32( degenerate_hash_bits, 0, @@ -117,7 +119,7 @@ DEFINE_uint32(seed, 0, "Hashing/random seed to use. 0 = choose at random"); DEFINE_string(secondary_cache_uri, "", "Full URI for creating a custom secondary cache object"); -DEFINE_string(cache_type, "lru_cache", "Type of block cache."); +DEFINE_string(cache_type, "hyper_clock_cache", "Type of block cache."); DEFINE_bool(use_jemalloc_no_dump_allocator, false, "Whether to use JemallocNoDumpAllocator"); @@ -182,6 +184,11 @@ DEFINE_bool(sck_randomize, false, DEFINE_bool(sck_footer_unique_id, false, "(-stress_cache_key) Simulate using proposed footer unique id"); // ## END stress_cache_key sub-tool options ## +// ## BEGIN stress_cache_instances sub-tool options ## +DEFINE_uint32(stress_cache_instances, 0, + "If > 0, run cache instance stress test instead"); +// Uses cache_size and cache_type, maybe more +// ## END stress_cache_instance sub-tool options ## namespace ROCKSDB_NAMESPACE { @@ -291,10 +298,19 @@ struct KeyGen { Cache::ObjectPtr createValue(Random64& rnd, MemoryAllocator* alloc) { char* rv = AllocateBlock(FLAGS_value_bytes, alloc).release(); - // Fill with some filler data, and take some CPU time - for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) { + // Fill with some filler data, and take some CPU time, but add redundancy + // as requested for compressibility. + uint32_t random_fill_size = std::max( + uint32_t{1}, std::min(FLAGS_value_bytes, + static_cast(FLAGS_compressible_to_ratio * + FLAGS_value_bytes))); + uint32_t i = 0; + for (; i < random_fill_size; i += 8) { EncodeFixed64(rv + i, rnd.Next()); } + for (; i < FLAGS_value_bytes; i++) { + rv[i] = rv[i % random_fill_size]; + } return rv; } @@ -309,16 +325,16 @@ Status SaveToFn(Cache::ObjectPtr from_obj, size_t /*from_offset*/, Status CreateFn(const Slice& data, CompressionType /*type*/, CacheTier /*source*/, Cache::CreateContext* /*context*/, - MemoryAllocator* /*allocator*/, Cache::ObjectPtr* out_obj, + MemoryAllocator* alloc, Cache::ObjectPtr* out_obj, size_t* out_charge) { - *out_obj = new char[data.size()]; + *out_obj = AllocateBlock(data.size(), alloc).release(); memcpy(*out_obj, data.data(), data.size()); *out_charge = data.size(); return Status::OK(); }; void DeleteFn(Cache::ObjectPtr value, MemoryAllocator* alloc) { - CustomDeleter{alloc}(static_cast(value)); + CacheAllocationDeleter{alloc}(static_cast(value)); } Cache::CacheItemHelper helper1_wos(CacheEntryRole::kDataBlock, DeleteFn); @@ -376,7 +392,12 @@ class CacheBench { fprintf(stderr, "Percentages must add to 100.\n"); exit(1); } + cache_ = MakeCache(); + } + + ~CacheBench() = default; + static std::shared_ptr MakeCache() { std::shared_ptr allocator; if (FLAGS_use_jemalloc_no_dump_allocator) { JemallocAllocatorOptions opts; @@ -395,12 +416,12 @@ class CacheBench { opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX); opts.memory_allocator = allocator; opts.eviction_effort_cap = FLAGS_eviction_effort_cap; - if (FLAGS_cache_type == "fixed_hyper_clock_cache" || - FLAGS_cache_type == "hyper_clock_cache") { + if (FLAGS_cache_type == "fixed_hyper_clock_cache") { opts.estimated_entry_charge = FLAGS_value_bytes_estimate > 0 ? FLAGS_value_bytes_estimate : FLAGS_value_bytes; - } else if (FLAGS_cache_type == "auto_hyper_clock_cache") { + } else if (FLAGS_cache_type == "auto_hyper_clock_cache" || + FLAGS_cache_type == "hyper_clock_cache") { if (FLAGS_value_bytes_estimate > 0) { opts.min_avg_entry_charge = FLAGS_value_bytes_estimate; } @@ -409,7 +430,7 @@ class CacheBench { exit(1); } ConfigureSecondaryCache(opts); - cache_ = opts.MakeSharedCache(); + return opts.MakeSharedCache(); } else if (FLAGS_cache_type == "lru_cache") { LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits, false /* strict_capacity_limit */, @@ -417,15 +438,13 @@ class CacheBench { opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX); opts.memory_allocator = allocator; ConfigureSecondaryCache(opts); - cache_ = NewLRUCache(opts); + return NewLRUCache(opts); } else { fprintf(stderr, "Cache type not supported.\n"); exit(1); } } - ~CacheBench() = default; - void PopulateCache() { Random64 rnd(FLAGS_seed); KeyGen keygen; @@ -479,7 +498,7 @@ class CacheBench { PrintEnv(); SharedState shared(this); - std::vector > threads(FLAGS_threads); + std::vector> threads(FLAGS_threads); for (uint32_t i = 0; i < FLAGS_threads; i++) { threads[i].reset(new ThreadState(i, &shared)); std::thread(ThreadBody, threads[i].get()).detach(); @@ -1141,6 +1160,59 @@ class StressCacheKey { double multiplier_ = 0.0; }; +// cache_bench -stress_cache_instances is a partially independent embedded tool +// for evaluating the time and space required to create and destroy many cache +// instances, as this is considered important for a default cache implementation +// which could see many throw-away instances in handling of Options, or created +// in large numbers for many very small DBs with many CFs. Prefix command line +// with /usr/bin/time to see max RSS memory. +class StressCacheInstances { + public: + void Run() { + const int kNumIterations = 10; + const auto clock = SystemClock::Default().get(); + caches_.reserve(FLAGS_stress_cache_instances); + + uint64_t total_create_time_us = 0; + uint64_t total_destroy_time_us = 0; + + for (int iter = 0; iter < kNumIterations; ++iter) { + // Create many cache instances + uint64_t start_create = clock->NowMicros(); + for (uint32_t i = 0; i < FLAGS_stress_cache_instances; ++i) { + caches_.emplace_back(CacheBench::MakeCache()); + } + uint64_t end_create = clock->NowMicros(); + uint64_t create_time = end_create - start_create; + total_create_time_us += create_time; + + // Destroy them + uint64_t start_destroy = clock->NowMicros(); + caches_.clear(); + uint64_t end_destroy = clock->NowMicros(); + uint64_t destroy_time = end_destroy - start_destroy; + total_destroy_time_us += destroy_time; + + printf( + "Iteration %d: Created %u caches in %.3f ms, destroyed in %.3f ms\n", + iter + 1, FLAGS_stress_cache_instances, create_time / 1000.0, + destroy_time / 1000.0); + } + + printf("Average creation time: %.3f ms (%.1f us per cache)\n", + static_cast(total_create_time_us) / kNumIterations / 1000.0, + static_cast(total_create_time_us) / kNumIterations / + FLAGS_stress_cache_instances); + printf("Average destruction time: %.3f ms (%.1f us per cache)\n", + static_cast(total_destroy_time_us) / kNumIterations / 1000.0, + static_cast(total_destroy_time_us) / kNumIterations / + FLAGS_stress_cache_instances); + } + + private: + std::vector> caches_; +}; + int cache_bench_tool(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ParseCommandLineFlags(&argc, &argv, true); @@ -1151,6 +1223,11 @@ int cache_bench_tool(int argc, char** argv) { return 0; } + if (FLAGS_stress_cache_instances > 0) { + StressCacheInstances().Run(); + return 0; + } + if (FLAGS_threads <= 0) { fprintf(stderr, "threads number <= 0\n"); exit(1); diff --git a/cache/cache_entry_stats.h b/cache/cache_entry_stats.h index 9968995da95a..f8c5e422e896 100644 --- a/cache/cache_entry_stats.h +++ b/cache/cache_entry_stats.h @@ -101,23 +101,23 @@ class CacheEntryStatsCollector { } // Gets saved stats, regardless of age - void GetStats(Stats *stats) { + void GetStats(Stats* stats) { std::lock_guard lock(saved_mutex_); *stats = saved_stats_; } - Cache *GetCache() const { return cache_; } + Cache* GetCache() const { return cache_; } // Gets or creates a shared instance of CacheEntryStatsCollector in the // cache itself, and saves into `ptr`. This shared_ptr will hold the // entry in cache until all refs are destroyed. - static Status GetShared(Cache *raw_cache, SystemClock *clock, - std::shared_ptr *ptr) { + static Status GetShared(Cache* raw_cache, SystemClock* clock, + std::shared_ptr* ptr) { assert(raw_cache); BasicTypedCacheInterface cache{raw_cache}; - const Slice &cache_key = GetCacheKey(); + const Slice& cache_key = GetCacheKey(); auto h = cache.Lookup(cache_key); if (h == nullptr) { // Not yet in cache, but Cache doesn't provide a built-in way to @@ -152,7 +152,7 @@ class CacheEntryStatsCollector { } private: - explicit CacheEntryStatsCollector(Cache *cache, SystemClock *clock) + explicit CacheEntryStatsCollector(Cache* cache, SystemClock* clock) : saved_stats_(), working_stats_(), last_start_time_micros_(0), @@ -160,7 +160,7 @@ class CacheEntryStatsCollector { cache_(cache), clock_(clock) {} - static const Slice &GetCacheKey() { + static const Slice& GetCacheKey() { // For each template instantiation static CacheKey ckey = CacheKey::CreateUniqueForProcessLifetime(); static Slice ckey_slice = ckey.AsSlice(); @@ -175,8 +175,8 @@ class CacheEntryStatsCollector { uint64_t last_start_time_micros_; uint64_t last_end_time_micros_; - Cache *const cache_; - SystemClock *const clock_; + Cache* const cache_; + SystemClock* const clock_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/cache/cache_key.cc b/cache/cache_key.cc index addff61d17b0..a5553c0d257c 100644 --- a/cache/cache_key.cc +++ b/cache/cache_key.cc @@ -24,7 +24,7 @@ namespace ROCKSDB_NAMESPACE { // 0 | >= 1<<63 | CreateUniqueForProcessLifetime // > 0 | any | OffsetableCacheKey.WithOffset -CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache *cache) { +CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache* cache) { // +1 so that we can reserve all zeros for "unset" cache key uint64_t id = cache->NewId() + 1; // Ensure we don't collide with CreateUniqueForProcessLifetime @@ -297,8 +297,8 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() { // // TODO: Nevertheless / regardless, an efficient way to detect (and thus // quantify) block cache corruptions, including collisions, should be added. -OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id, - const std::string &db_session_id, +OffsetableCacheKey::OffsetableCacheKey(const std::string& db_id, + const std::string& db_session_id, uint64_t file_number) { UniqueId64x2 internal_id; Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number, diff --git a/cache/cache_key.h b/cache/cache_key.h index 0b93c6bd9472..4cf5d2e7d34b 100644 --- a/cache/cache_key.h +++ b/cache/cache_key.h @@ -44,13 +44,13 @@ class CacheKey { inline Slice AsSlice() const { static_assert(sizeof(*this) == 16, "Standardized on 16-byte cache key"); assert(!IsEmpty()); - return Slice(reinterpret_cast(this), sizeof(*this)); + return Slice(reinterpret_cast(this), sizeof(*this)); } // Create a CacheKey that is unique among others associated with this Cache // instance. Depends on Cache::NewId. This is useful for block cache // "reservations". - static CacheKey CreateUniqueForCacheLifetime(Cache *cache); + static CacheKey CreateUniqueForCacheLifetime(Cache* cache); // Create a CacheKey that is unique among others for the lifetime of this // process. This is useful for saving in a static data member so that @@ -87,7 +87,7 @@ class OffsetableCacheKey : private CacheKey { // Constructs an OffsetableCacheKey with the given information about a file. // This constructor never generates an "empty" base key. - OffsetableCacheKey(const std::string &db_id, const std::string &db_session_id, + OffsetableCacheKey(const std::string& db_id, const std::string& db_session_id, uint64_t file_number); // Creates an OffsetableCacheKey from an SST unique ID, so that cache keys @@ -134,9 +134,9 @@ class OffsetableCacheKey : private CacheKey { static_assert(sizeof(file_num_etc64_) == kCommonPrefixSize, "8 byte common prefix expected"); assert(!IsEmpty()); - assert(&this->file_num_etc64_ == static_cast(this)); + assert(&this->file_num_etc64_ == static_cast(this)); - return Slice(reinterpret_cast(this), kCommonPrefixSize); + return Slice(reinterpret_cast(this), kCommonPrefixSize); } }; diff --git a/cache/cache_reservation_manager.h b/cache/cache_reservation_manager.h index a7b06dea2073..deff5be8a285 100644 --- a/cache/cache_reservation_manager.h +++ b/cache/cache_reservation_manager.h @@ -44,8 +44,8 @@ class CacheReservationManager { bool increase) = 0; virtual Status MakeCacheReservation( std::size_t incremental_memory_used, - std::unique_ptr - *handle) = 0; + std::unique_ptr* + handle) = 0; virtual std::size_t GetTotalReservedCacheSize() = 0; virtual std::size_t GetTotalMemoryUsed() = 0; }; @@ -90,11 +90,11 @@ class CacheReservationManagerImpl bool delayed_decrease = false); // no copy constructor, copy assignment, move constructor, move assignment - CacheReservationManagerImpl(const CacheReservationManagerImpl &) = delete; - CacheReservationManagerImpl &operator=(const CacheReservationManagerImpl &) = + CacheReservationManagerImpl(const CacheReservationManagerImpl&) = delete; + CacheReservationManagerImpl& operator=(const CacheReservationManagerImpl&) = delete; - CacheReservationManagerImpl(CacheReservationManagerImpl &&) = delete; - CacheReservationManagerImpl &operator=(CacheReservationManagerImpl &&) = + CacheReservationManagerImpl(CacheReservationManagerImpl&&) = delete; + CacheReservationManagerImpl& operator=(CacheReservationManagerImpl&&) = delete; ~CacheReservationManagerImpl() override; @@ -178,7 +178,7 @@ class CacheReservationManagerImpl // REQUIRES: handle != nullptr Status MakeCacheReservation( std::size_t incremental_memory_used, - std::unique_ptr *handle) + std::unique_ptr* handle) override; // Return the size of the cache (which is a multiple of kSizeDummyEntry) @@ -200,7 +200,7 @@ class CacheReservationManagerImpl // For testing only - it is to help ensure the CacheItemHelperForRole // accessed from CacheReservationManagerImpl and the one accessed from the // test are from the same translation units - static const Cache::CacheItemHelper *TEST_GetCacheItemHelperForRole(); + static const Cache::CacheItemHelper* TEST_GetCacheItemHelperForRole(); private: static constexpr std::size_t kSizeDummyEntry = 256 * 1024; @@ -216,7 +216,7 @@ class CacheReservationManagerImpl bool delayed_decrease_; std::atomic cache_allocated_size_; std::size_t memory_used_; - std::vector dummy_handles_; + std::vector dummy_handles_; CacheKey cache_key_; }; @@ -251,14 +251,14 @@ class ConcurrentCacheReservationManager std::shared_ptr cache_res_mgr) { cache_res_mgr_ = std::move(cache_res_mgr); } - ConcurrentCacheReservationManager(const ConcurrentCacheReservationManager &) = + ConcurrentCacheReservationManager(const ConcurrentCacheReservationManager&) = delete; - ConcurrentCacheReservationManager &operator=( - const ConcurrentCacheReservationManager &) = delete; - ConcurrentCacheReservationManager(ConcurrentCacheReservationManager &&) = + ConcurrentCacheReservationManager& operator=( + const ConcurrentCacheReservationManager&) = delete; + ConcurrentCacheReservationManager(ConcurrentCacheReservationManager&&) = delete; - ConcurrentCacheReservationManager &operator=( - ConcurrentCacheReservationManager &&) = delete; + ConcurrentCacheReservationManager& operator=( + ConcurrentCacheReservationManager&&) = delete; ~ConcurrentCacheReservationManager() override {} @@ -286,7 +286,7 @@ class ConcurrentCacheReservationManager inline Status MakeCacheReservation( std::size_t incremental_memory_used, - std::unique_ptr *handle) + std::unique_ptr* handle) override { std::unique_ptr wrapped_handle; diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 12bcfe6cd437..b762fe4f8af7 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -644,7 +644,7 @@ using TypedHandle = SharedCache::TypedHandle; TEST_P(CacheTest, SetCapacity) { if (IsHyperClock()) { - // TODO: update test & code for limited supoort + // TODO: update test & code for limited support ROCKSDB_GTEST_BYPASS( "HyperClockCache doesn't support arbitrary capacity " "adjustments."); diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index 090213cb0d02..70155791a41c 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -10,12 +10,12 @@ #include "cache/clock_cache.h" #include -#include #include #include #include #include #include +#include #include #include #include @@ -26,10 +26,9 @@ #include "cache/cache_key.h" #include "cache/secondary_cache_adapter.h" #include "logging/logging.h" -#include "monitoring/perf_context_imp.h" -#include "monitoring/statistics_impl.h" -#include "port/lang.h" +#include "port/likely.h" #include "rocksdb/env.h" +#include "util/autovector.h" #include "util/hash.h" #include "util/math.h" #include "util/random.h" @@ -39,13 +38,11 @@ namespace ROCKSDB_NAMESPACE { namespace clock_cache { namespace { -inline uint64_t GetRefcount(uint64_t meta) { - return ((meta >> ClockHandle::kAcquireCounterShift) - - (meta >> ClockHandle::kReleaseCounterShift)) & - ClockHandle::kCounterMask; -} +using SlotMeta = ClockHandle::SlotMeta; +using AcquireCounter = SlotMeta::AcquireCounter; +using ReleaseCounter = SlotMeta::ReleaseCounter; -inline uint64_t GetInitialCountdown(Cache::Priority priority) { +inline uint32_t GetInitialCountdown(Cache::Priority priority) { // Set initial clock data from priority // TODO: configuration parameters for priority handling and clock cycle // count? @@ -66,11 +63,11 @@ inline uint64_t GetInitialCountdown(Cache::Priority priority) { inline void MarkEmpty(ClockHandle& h) { #ifndef NDEBUG // Mark slot as empty, with assertion - uint64_t meta = h.meta.Exchange(0); - assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction); + auto old_meta = h.meta.Exchange({}); + assert(old_meta.IsUnderConstruction()); #else // Mark slot as empty - h.meta.Store(0); + h.meta.Store({}); #endif } @@ -86,18 +83,20 @@ inline void FreeDataMarkEmpty(ClockHandle& h, MemoryAllocator* allocator) { // Called to undo the effect of referencing an entry for internal purposes, // so it should not be marked as having been used. -inline void Unref(const ClockHandle& h, uint64_t count = 1) { +inline void Unref(const ClockHandle& h, uint32_t count = 1) { // Pretend we never took the reference // WART: there's a tiny chance we release last ref to invisible // entry here. If that happens, we let eviction take care of it. - uint64_t old_meta = h.meta.FetchSub(ClockHandle::kAcquireIncrement * count); - assert(GetRefcount(old_meta) != 0); + SlotMeta old_meta; + h.meta.Apply(AcquireCounter::MinusTransformPromiseNoUnderflow(count), + &old_meta); + assert(old_meta.GetRefcount() != 0); (void)old_meta; } inline bool ClockUpdate(ClockHandle& h, BaseClockTable::EvictionData* data, bool* purgeable = nullptr) { - uint64_t meta; + SlotMeta meta; if (purgeable) { assert(*purgeable == false); // In AutoHCC, our eviction process follows the chain structure, so we @@ -111,46 +110,40 @@ inline bool ClockUpdate(ClockHandle& h, BaseClockTable::EvictionData* data, meta = h.meta.LoadRelaxed(); } - if (((meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit) == - 0) { + if (!meta.IsShareable()) { // Only clock update Shareable entries if (purgeable) { *purgeable = true; // AutoHCC only: make sure we only attempt to update non-empty slots - assert((meta >> ClockHandle::kStateShift) & - ClockHandle::kStateOccupiedBit); + assert(!meta.IsEmpty()); } return false; } - uint64_t acquire_count = - (meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask; - uint64_t release_count = - (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask; + uint32_t acquire_count = meta.GetAcquireCounter(); + uint32_t release_count = meta.GetReleaseCounter(); if (acquire_count != release_count) { // Only clock update entries with no outstanding refs data->seen_pinned_count++; return false; } - if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) && - acquire_count > 0) { + if (meta.IsVisible() && acquire_count > 0) { // Decrement clock - uint64_t new_count = - std::min(acquire_count - 1, uint64_t{ClockHandle::kMaxCountdown} - 1); + uint32_t new_count = + std::min(acquire_count - 1, uint32_t{ClockHandle::kMaxCountdown} - 1); // Compare-exchange in the decremented clock info, but // not aggressively - uint64_t new_meta = - (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) | - (meta & ClockHandle::kHitBitMask) | - (new_count << ClockHandle::kReleaseCounterShift) | - (new_count << ClockHandle::kAcquireCounterShift); + SlotMeta new_meta = meta; + new_meta.SetReleaseCounter(new_count); + new_meta.SetAcquireCounter(new_count); h.meta.CasStrongRelaxed(meta, new_meta); return false; } // Otherwise, remove entry (either unreferenced invisible or // unreferenced and expired visible). - if (h.meta.CasStrong(meta, (uint64_t{ClockHandle::kStateConstruction} - << ClockHandle::kStateShift) | - (meta & ClockHandle::kHitBitMask))) { + SlotMeta construction_meta; + construction_meta.SetUnderConstruction(); + construction_meta.SetHit(meta.GetHit()); + if (h.meta.CasStrong(meta, construction_meta)) { // Took ownership. data->freed_charge += h.GetTotalCharge(); data->freed_count += 1; @@ -216,39 +209,39 @@ inline bool ClockUpdate(ClockHandle& h, BaseClockTable::EvictionData* data, // counter to reach "high" state again and bumped back to "medium." (This // motivates only checking for release counter in high state, not both in high // state.) -inline void CorrectNearOverflow(uint64_t old_meta, - AcqRelAtomic& meta) { +inline void CorrectNearOverflow(SlotMeta old_meta, + BitFieldsAtomic& meta) { // We clear both top-most counter bits at the same time. - constexpr uint64_t kCounterTopBit = uint64_t{1} - << (ClockHandle::kCounterNumBits - 1); - constexpr uint64_t kClearBits = - (kCounterTopBit << ClockHandle::kAcquireCounterShift) | - (kCounterTopBit << ClockHandle::kReleaseCounterShift); - // A simple check that allows us to initiate clearing the top bits for - // a large portion of the "high" state space on release counter. - constexpr uint64_t kCheckBits = - (kCounterTopBit | (ClockHandle::kMaxCountdown + 1)) - << ClockHandle::kReleaseCounterShift; + constexpr uint32_t kCounterTopBit = uint32_t{1} + << (SlotMeta::kCounterNumBits - 1); + // The threshold for correcting "near overflow" is to ensure + // (a) the value has a top bit set that can be cleared + // (b) when we clear the top bit, the eviction state will be preserved + // (everything >= kMaxCountdown is treated equivalently) + // As mentioned above, we only check the release count. + constexpr uint32_t kThreshold = kCounterTopBit + ClockHandle::kMaxCountdown; - if (UNLIKELY(old_meta & kCheckBits)) { - meta.FetchAndRelaxed(~kClearBits); + if (UNLIKELY(old_meta.GetReleaseCounter() > kThreshold)) { + auto clear_transform = AcquireCounter::AndTransform(kCounterTopBit - 1) + + ReleaseCounter::AndTransform(kCounterTopBit - 1); + meta.ApplyRelaxed(clear_transform); } } inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h, - uint64_t initial_countdown, bool* already_matches) { + uint32_t initial_countdown, bool* already_matches) { assert(*already_matches == false); // Optimistically transition the slot from "empty" to // "under construction" (no effect on other states) - uint64_t old_meta = h.meta.FetchOr(uint64_t{ClockHandle::kStateOccupiedBit} - << ClockHandle::kStateShift); - uint64_t old_state = old_meta >> ClockHandle::kStateShift; + auto set_occupied = SlotMeta::OccupiedFlag::SetTransform(); + SlotMeta old_meta; + h.meta.Apply(set_occupied, &old_meta); - if (old_state == ClockHandle::kStateEmpty) { + if (old_meta.IsEmpty()) { // We've started inserting into an available slot, and taken // ownership. return true; - } else if (old_state != ClockHandle::kStateVisible) { + } else if (!old_meta.IsVisible()) { // Slot not usable / touchable now return false; } @@ -256,15 +249,17 @@ inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h, // But first, we need to acquire a ref to read it. In fact, number of // refs for initial countdown, so that we boost the clock state if // this is a match. - old_meta = - h.meta.FetchAdd(ClockHandle::kAcquireIncrement * initial_countdown); + auto add_acquire = + AcquireCounter::PlusTransformPromiseNoOverflow(initial_countdown); + h.meta.Apply(add_acquire, &old_meta); // Like Lookup - if ((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateVisible) { + if (old_meta.IsVisible()) { // Acquired a read reference if (h.hashed_key == proto.hashed_key) { // Match. Release in a way that boosts the clock state - old_meta = - h.meta.FetchAdd(ClockHandle::kReleaseIncrement * initial_countdown); + auto add_release = + ReleaseCounter::PlusTransformPromiseNoOverflow(initial_countdown); + h.meta.Apply(add_release, &old_meta); // Correct for possible (but rare) overflow CorrectNearOverflow(old_meta, h.meta); // Insert detached instead (only if return handle needed) @@ -274,8 +269,7 @@ inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h, // Mismatch. Unref(h, initial_countdown); } - } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == - ClockHandle::kStateInvisible)) { + } else if (UNLIKELY(old_meta.IsInvisible())) { // Pretend we never took the reference Unref(h, initial_countdown); } else { @@ -287,25 +281,23 @@ inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h, } inline void FinishSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h, - uint64_t initial_countdown, bool keep_ref) { + uint32_t initial_countdown, bool keep_ref) { // Save data fields ClockHandleBasicData* h_alias = &h; *h_alias = proto; // Transition from "under construction" state to "visible" state - uint64_t new_meta = uint64_t{ClockHandle::kStateVisible} - << ClockHandle::kStateShift; + SlotMeta new_meta; + new_meta.SetVisible(); // Maybe with an outstanding reference - new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift; - new_meta |= (initial_countdown - keep_ref) - << ClockHandle::kReleaseCounterShift; + new_meta.SetAcquireCounter(initial_countdown); + new_meta.SetReleaseCounter(initial_countdown - (keep_ref ? 1 : 0)); #ifndef NDEBUG // Save the state transition, with assertion - uint64_t old_meta = h.meta.Exchange(new_meta); - assert(old_meta >> ClockHandle::kStateShift == - ClockHandle::kStateConstruction); + auto old_meta = h.meta.Exchange(new_meta); + assert(old_meta.IsUnderConstruction()); #else // Save the state transition h.meta.Store(new_meta); @@ -313,7 +305,7 @@ inline void FinishSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h, } bool TryInsert(const ClockHandleBasicData& proto, ClockHandle& h, - uint64_t initial_countdown, bool keep_ref, + uint32_t initial_countdown, bool keep_ref, bool* already_matches) { bool b = BeginSlotInsert(proto, h, initial_countdown, already_matches); if (b) { @@ -327,50 +319,40 @@ template void ConstApplyToEntriesRange(const Func& func, const HandleImpl* begin, const HandleImpl* end, bool apply_if_will_be_deleted) { - uint64_t check_state_mask = ClockHandle::kStateShareableBit; - if (!apply_if_will_be_deleted) { - check_state_mask |= ClockHandle::kStateVisibleBit; - } - for (const HandleImpl* h = begin; h < end; ++h) { // Note: to avoid using compare_exchange, we have to be extra careful. - uint64_t old_meta = h->meta.LoadRelaxed(); + SlotMeta old_meta = h->meta.LoadRelaxed(); // Check if it's an entry visible to lookups - if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) { - // Increment acquire counter. Note: it's possible that the entry has - // completely changed since we loaded old_meta, but incrementing acquire - // count is always safe. (Similar to optimistic Lookup here.) - old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement); - // Check whether we actually acquired a reference. - if ((old_meta >> ClockHandle::kStateShift) & - ClockHandle::kStateShareableBit) { - // Apply func if appropriate - if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) { - func(*h); + if (apply_if_will_be_deleted || old_meta.IsVisible()) { + if (old_meta.IsShareable()) { + // Increment acquire counter. Note: it's possible that the entry has + // completely changed since we loaded old_meta, but incrementing acquire + // count is always safe. (Similar to optimistic Lookup here.) + auto add_acquire = AcquireCounter::PlusTransformPromiseNoOverflow(1); + h->meta.Apply(add_acquire, &old_meta); + // Check whether we actually acquired a reference. + if (old_meta.IsShareable()) { + // Apply func if appropriate + if (apply_if_will_be_deleted || old_meta.IsVisible()) { + func(*h); + } + // Pretend we never took the reference + Unref(*h); + // No net change, so don't need to check for overflow + } else { + // For other states, incrementing the acquire counter has no effect + // so we don't need to undo it. Furthermore, we cannot safely undo + // it because we did not acquire a read reference to lock the + // entry in a Shareable state. } - // Pretend we never took the reference - Unref(*h); - // No net change, so don't need to check for overflow - } else { - // For other states, incrementing the acquire counter has no effect - // so we don't need to undo it. Furthermore, we cannot safely undo - // it because we did not acquire a read reference to lock the - // entry in a Shareable state. } } } } -constexpr uint32_t kStrictCapacityLimitBit = 1u << 31; - -uint32_t SanitizeEncodeEecAndScl(int eviction_effort_cap, - bool strict_capacit_limit) { +uint32_t SanitizeEvictionEffortCap(int eviction_effort_cap) { eviction_effort_cap = std::max(int{1}, eviction_effort_cap); - eviction_effort_cap = - std::min(static_cast(~kStrictCapacityLimitBit), eviction_effort_cap); - uint32_t eec_and_scl = static_cast(eviction_effort_cap); - eec_and_scl |= strict_capacit_limit ? kStrictCapacityLimitBit : 0; - return eec_and_scl; + return static_cast(eviction_effort_cap); } } // namespace @@ -381,6 +363,22 @@ void ClockHandleBasicData::FreeData(MemoryAllocator* allocator) const { } } +BaseClockTable::BaseClockTable(size_t capacity, bool strict_capacity_limit, + int eviction_effort_cap, + CacheMetadataChargePolicy metadata_charge_policy, + MemoryAllocator* allocator, + const Cache::EvictionCallback* eviction_callback, + const uint32_t* hash_seed) + : capacity_(capacity), + eec_and_scl_(EecAndScl{} + .With( + SanitizeEvictionEffortCap(eviction_effort_cap)) + .With(strict_capacity_limit)), + metadata_charge_policy_(metadata_charge_policy), + allocator_(allocator), + eviction_callback_(*eviction_callback), + hash_seed_(*hash_seed) {} + template HandleImpl* BaseClockTable::StandaloneInsert( const ClockHandleBasicData& proto) { @@ -391,9 +389,9 @@ HandleImpl* BaseClockTable::StandaloneInsert( h->SetStandalone(); // Single reference (standalone entries only created if returning a refed // Handle back to user) - uint64_t meta = uint64_t{ClockHandle::kStateInvisible} - << ClockHandle::kStateShift; - meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift; + SlotMeta meta; + meta.SetInvisible(); + meta.SetAcquireCounter(1); h->meta.Store(meta); // Keep track of how much of usage is standalone standalone_usage_.FetchAddRelaxed(proto.GetTotalCharge()); @@ -402,8 +400,7 @@ HandleImpl* BaseClockTable::StandaloneInsert( template typename Table::HandleImpl* BaseClockTable::CreateStandalone( - ClockHandleBasicData& proto, size_t capacity, uint32_t eec_and_scl, - bool allow_uncharged) { + ClockHandleBasicData& proto, bool allow_uncharged) { Table& derived = static_cast(*this); typename Table::InsertState state; derived.StartInsert(state); @@ -412,10 +409,10 @@ typename Table::HandleImpl* BaseClockTable::CreateStandalone( // NOTE: we can use eec_and_scl as eviction_effort_cap below because // strict_capacity_limit=true is supposed to disable the limit on eviction // effort, and a large value effectively does that. - if (eec_and_scl & kStrictCapacityLimitBit) { + if (eec_and_scl_.LoadRelaxed().Get()) { Status s = ChargeUsageMaybeEvictStrict( - total_charge, capacity, - /*need_evict_for_occupancy=*/false, eec_and_scl, state); + total_charge, + /*need_evict_for_occupancy=*/false, state); if (!s.ok()) { if (allow_uncharged) { proto.total_charge = 0; @@ -426,8 +423,8 @@ typename Table::HandleImpl* BaseClockTable::CreateStandalone( } else { // Case strict_capacity_limit == false bool success = ChargeUsageMaybeEvictNonStrict
( - total_charge, capacity, - /*need_evict_for_occupancy=*/false, eec_and_scl, state); + total_charge, + /*need_evict_for_occupancy=*/false, state); if (!success) { // Force the issue usage_.FetchAddRelaxed(total_charge); @@ -439,8 +436,9 @@ typename Table::HandleImpl* BaseClockTable::CreateStandalone( template Status BaseClockTable::ChargeUsageMaybeEvictStrict( - size_t total_charge, size_t capacity, bool need_evict_for_occupancy, - uint32_t eviction_effort_cap, typename Table::InsertState& state) { + size_t total_charge, bool need_evict_for_occupancy, + typename Table::InsertState& state) { + const size_t capacity = capacity_.LoadRelaxed(); if (total_charge > capacity) { return Status::MemoryLimit( "Cache entry too large for a single cache shard: " + @@ -465,8 +463,7 @@ Status BaseClockTable::ChargeUsageMaybeEvictStrict( } if (request_evict_charge > 0) { EvictionData data; - static_cast(this)->Evict(request_evict_charge, state, &data, - eviction_effort_cap); + static_cast(this)->Evict(request_evict_charge, state, &data); occupancy_.FetchSub(data.freed_count); if (LIKELY(data.freed_charge > need_evict_charge)) { assert(data.freed_count > 0); @@ -495,8 +492,8 @@ Status BaseClockTable::ChargeUsageMaybeEvictStrict( template inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict( - size_t total_charge, size_t capacity, bool need_evict_for_occupancy, - uint32_t eviction_effort_cap, typename Table::InsertState& state) { + size_t total_charge, bool need_evict_for_occupancy, + typename Table::InsertState& state) { // For simplicity, we consider that either the cache can accept the insert // with no evictions, or we must evict enough to make (at least) enough // space. It could lead to unnecessary failures or excessive evictions in @@ -506,7 +503,8 @@ inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict( // charge. Thus, we should evict some extra if it's not a signifcant // portion of the shard capacity. This can have the side benefit of // involving fewer threads in eviction. - size_t old_usage = usage_.LoadRelaxed(); + const size_t old_usage = usage_.LoadRelaxed(); + const size_t capacity = capacity_.LoadRelaxed(); size_t need_evict_charge; // NOTE: if total_charge > old_usage, there isn't yet enough to evict // `total_charge` amount. Even if we only try to evict `old_usage` amount, @@ -532,8 +530,7 @@ inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict( } EvictionData data; if (need_evict_charge > 0) { - static_cast(this)->Evict(need_evict_charge, state, &data, - eviction_effort_cap); + static_cast(this)->Evict(need_evict_charge, state, &data); // Deal with potential occupancy deficit if (UNLIKELY(need_evict_for_occupancy) && data.freed_count == 0) { assert(data.freed_charge == 0); @@ -557,11 +554,10 @@ void BaseClockTable::TrackAndReleaseEvictedEntry(ClockHandle* h) { if (eviction_callback_) { // For key reconstructed from hash UniqueId64x2 unhashed; - took_value_ownership = - eviction_callback_(ClockCacheShard::ReverseHash( - h->GetHash(), &unhashed, hash_seed_), - static_cast(h), - h->meta.LoadRelaxed() & ClockHandle::kHitBitMask); + took_value_ownership = eviction_callback_( + ClockCacheShard::ReverseHash( + h->GetHash(), &unhashed, hash_seed_), + static_cast(h), h->meta.LoadRelaxed().GetHit()); } if (!took_value_ownership) { h->FreeData(allocator_); @@ -569,8 +565,10 @@ void BaseClockTable::TrackAndReleaseEvictedEntry(ClockHandle* h) { MarkEmpty(*h); } -bool IsEvictionEffortExceeded(const BaseClockTable::EvictionData& data, - uint32_t eviction_effort_cap) { +bool BaseClockTable::IsEvictionEffortExceeded( + const BaseClockTable::EvictionData& data) const { + auto eviction_effort_cap = + eec_and_scl_.LoadRelaxed().GetEffectiveEvictionEffortCap(); // Basically checks whether the ratio of useful effort to wasted effort is // too low, with a start-up allowance for wasted effort before any useful // effort. @@ -581,8 +579,7 @@ bool IsEvictionEffortExceeded(const BaseClockTable::EvictionData& data, template Status BaseClockTable::Insert(const ClockHandleBasicData& proto, typename Table::HandleImpl** handle, - Cache::Priority priority, size_t capacity, - uint32_t eec_and_scl) { + Cache::Priority priority) { using HandleImpl = typename Table::HandleImpl; Table& derived = static_cast(*this); @@ -603,9 +600,9 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto, // NOTE: we can use eec_and_scl as eviction_effort_cap below because // strict_capacity_limit=true is supposed to disable the limit on eviction // effort, and a large value effectively does that. - if (eec_and_scl & kStrictCapacityLimitBit) { + if (eec_and_scl_.LoadRelaxed().Get()) { Status s = ChargeUsageMaybeEvictStrict
( - total_charge, capacity, need_evict_for_occupancy, eec_and_scl, state); + total_charge, need_evict_for_occupancy, state); if (!s.ok()) { // Revert occupancy occupancy_.FetchSubRelaxed(1); @@ -614,7 +611,7 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto, } else { // Case strict_capacity_limit == false bool success = ChargeUsageMaybeEvictNonStrict
( - total_charge, capacity, need_evict_for_occupancy, eec_and_scl, state); + total_charge, need_evict_for_occupancy, state); if (!success) { // Revert occupancy occupancy_.FetchSubRelaxed(1); @@ -640,7 +637,7 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto, // * Have to insert into a suboptimal location (more probes) so that the // old entry can be kept around as well. - uint64_t initial_countdown = GetInitialCountdown(priority); + uint32_t initial_countdown = GetInitialCountdown(priority); assert(initial_countdown > 0); HandleImpl* e = @@ -685,44 +682,46 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto, void BaseClockTable::Ref(ClockHandle& h) { // Increment acquire counter - uint64_t old_meta = h.meta.FetchAdd(ClockHandle::kAcquireIncrement); + SlotMeta old_meta; + h.meta.Apply(AcquireCounter::PlusTransformPromiseNoOverflow(1), &old_meta); - assert((old_meta >> ClockHandle::kStateShift) & - ClockHandle::kStateShareableBit); + assert(old_meta.IsShareable()); // Must have already had a reference - assert(GetRefcount(old_meta) > 0); + assert(old_meta.GetRefcount() > 0); (void)old_meta; } #ifndef NDEBUG -void BaseClockTable::TEST_RefN(ClockHandle& h, size_t n) { +void BaseClockTable::TEST_RefN(ClockHandle& h, uint32_t n) { // Increment acquire counter - uint64_t old_meta = h.meta.FetchAdd(n * ClockHandle::kAcquireIncrement); + SlotMeta old_meta; + h.meta.Apply(AcquireCounter::PlusTransformPromiseNoOverflow(n), &old_meta); - assert((old_meta >> ClockHandle::kStateShift) & - ClockHandle::kStateShareableBit); + assert(old_meta.IsShareable()); (void)old_meta; } -void BaseClockTable::TEST_ReleaseNMinus1(ClockHandle* h, size_t n) { +void BaseClockTable::TEST_ReleaseNMinus1(ClockHandle* h, uint32_t n) { assert(n > 0); // Like n-1 Releases, but assumes one more will happen in the caller to take // care of anything like erasing an unreferenced, invisible entry. - uint64_t old_meta = - h->meta.FetchAdd((n - 1) * ClockHandle::kReleaseIncrement); - assert((old_meta >> ClockHandle::kStateShift) & - ClockHandle::kStateShareableBit); + SlotMeta old_meta; + h->meta.Apply(ReleaseCounter::PlusTransformPromiseNoOverflow(n - 1), + &old_meta); + assert(old_meta.IsShareable()); (void)old_meta; } #endif FixedHyperClockTable::FixedHyperClockTable( - size_t capacity, CacheMetadataChargePolicy metadata_charge_policy, + size_t capacity, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy, MemoryAllocator* allocator, const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed, const Opts& opts) - : BaseClockTable(metadata_charge_policy, allocator, eviction_callback, + : BaseClockTable(capacity, strict_capacity_limit, opts.eviction_effort_cap, + metadata_charge_policy, allocator, eviction_callback, hash_seed), length_bits_(CalcHashBits(capacity, opts.estimated_value_size, metadata_charge_policy)), @@ -744,23 +743,20 @@ FixedHyperClockTable::~FixedHyperClockTable() { // in the table. for (size_t i = 0; i < GetTableSize(); i++) { HandleImpl& h = array_[i]; - switch (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) { - case ClockHandle::kStateEmpty: - // noop - break; - case ClockHandle::kStateInvisible: // rare but possible - case ClockHandle::kStateVisible: - assert(GetRefcount(h.meta.LoadRelaxed()) == 0); - h.FreeData(allocator_); + SlotMeta meta = h.meta.LoadRelaxed(); + if (meta.IsShareable()) { + // NOTE: Reaching here invisible is rare but possible + assert(meta.GetRefcount() == 0); + h.FreeData(allocator_); #ifndef NDEBUG - Rollback(h.hashed_key, &h); - ReclaimEntryUsage(h.GetTotalCharge()); + Rollback(h.hashed_key, &h); + ReclaimEntryUsage(h.GetTotalCharge()); #endif - break; - // otherwise - default: - assert(false); - break; + } else { + // Should be no transient "under construction" states unless a thread + // was killed or we are being destructed while another thread is still + // operating on the structure + assert(meta.IsEmpty()); } } @@ -782,7 +778,7 @@ bool FixedHyperClockTable::GrowIfNeeded(size_t new_occupancy, InsertState&) { } FixedHyperClockTable::HandleImpl* FixedHyperClockTable::DoInsert( - const ClockHandleBasicData& proto, uint64_t initial_countdown, + const ClockHandleBasicData& proto, uint32_t initial_countdown, bool keep_ref, InsertState&) { bool already_matches = false; HandleImpl* e = FindSlot( @@ -833,47 +829,46 @@ FixedHyperClockTable::HandleImpl* FixedHyperClockTable::Lookup( HandleImpl* e = FindSlot( hashed_key, [&](HandleImpl* h) { + SlotMeta old_meta; // Mostly branch-free version (similar performance) /* - uint64_t old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement, - std::memory_order_acquire); - bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U; - bool visible = (old_meta >> ClockHandle::kStateShift) & 1U; - bool match = (h->key == key) & visible; - h->meta.FetchSub(static_cast(Shareable & !match) << - ClockHandle::kAcquireCounterShift); return - match; + h->meta.Apply(AcquireCounter::PlusTransformPromiseNoOverflow(1), + &old_meta); + bool shareable = old_meta.IsShareable(); + bool visible = old_meta.IsVisible(); + bool match = (h->hashed_key == hashed_key) & visible; + h->meta.Apply(AcquireCounter::MinusTransformPromiseNoUnderflow( + uint32_t{shareable} & uint32_t{!match})); + h->meta.Apply(SlotMeta::HitFlag::Or(match)); + return match; */ // Optimistic lookup should pay off when the table is relatively // sparse. constexpr bool kOptimisticLookup = true; - uint64_t old_meta; if (!kOptimisticLookup) { old_meta = h->meta.Load(); - if ((old_meta >> ClockHandle::kStateShift) != - ClockHandle::kStateVisible) { + if (!old_meta.IsVisible()) { return false; } } // (Optimistically) increment acquire counter - old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement); + h->meta.Apply(AcquireCounter::PlusTransformPromiseNoOverflow(1), + &old_meta); // Check if it's an entry visible to lookups - if ((old_meta >> ClockHandle::kStateShift) == - ClockHandle::kStateVisible) { + if (old_meta.IsVisible()) { // Acquired a read reference if (h->hashed_key == hashed_key) { // Match // Update the hit bit if (eviction_callback_) { - h->meta.FetchOrRelaxed(uint64_t{1} << ClockHandle::kHitBitShift); + h->meta.ApplyRelaxed(SlotMeta::HitFlag::SetTransform()); } return true; } else { // Mismatch. Pretend we never took the reference Unref(*h); } - } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == - ClockHandle::kStateInvisible)) { + } else if (UNLIKELY(old_meta.IsInvisible())) { // Pretend we never took the reference Unref(*h); } else { @@ -897,53 +892,49 @@ bool FixedHyperClockTable::Release(HandleImpl* h, bool useful, // is only freed up by EvictFromClock (called by Insert when space is needed) // and Erase. We do this to avoid an extra atomic read of the variable usage_. - uint64_t old_meta; + SlotMeta old_meta; if (useful) { // Increment release counter to indicate was used - old_meta = h->meta.FetchAdd(ClockHandle::kReleaseIncrement); + auto add_release = ReleaseCounter::PlusTransformPromiseNoOverflow(1); + h->meta.Apply(add_release, &old_meta); } else { // Decrement acquire counter to pretend it never happened - old_meta = h->meta.FetchSub(ClockHandle::kAcquireIncrement); + auto sub_acquire = AcquireCounter::MinusTransformPromiseNoUnderflow(1); + h->meta.Apply(sub_acquire, &old_meta); } - assert((old_meta >> ClockHandle::kStateShift) & - ClockHandle::kStateShareableBit); + assert(old_meta.IsShareable()); // No underflow - assert(((old_meta >> ClockHandle::kAcquireCounterShift) & - ClockHandle::kCounterMask) != - ((old_meta >> ClockHandle::kReleaseCounterShift) & - ClockHandle::kCounterMask)); + assert(old_meta.GetAcquireCounter() != old_meta.GetReleaseCounter()); - if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift == - ClockHandle::kStateInvisible)) { + if (erase_if_last_ref || UNLIKELY(old_meta.IsInvisible())) { // FIXME: There's a chance here that another thread could replace this // entry and we end up erasing the wrong one. - // Update for last FetchAdd op + // Update for last Apply op if (useful) { - old_meta += ClockHandle::kReleaseIncrement; + old_meta.SetReleaseCounter(old_meta.GetReleaseCounter() + 1); } else { - old_meta -= ClockHandle::kAcquireIncrement; + old_meta.SetAcquireCounter(old_meta.GetAcquireCounter() - 1); } // Take ownership if no refs + SlotMeta construction_meta; + construction_meta.SetUnderConstruction(); do { - if (GetRefcount(old_meta) != 0) { + if (old_meta.GetRefcount() != 0) { // Not last ref at some point in time during this Release call // Correct for possible (but rare) overflow CorrectNearOverflow(old_meta, h->meta); return false; } - if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit} - << ClockHandle::kStateShift)) == 0) { + if (!old_meta.IsShareable()) { // Someone else took ownership return false; } // Note that there's a small chance that we release, another thread // replaces this entry with another, reaches zero refs, and then we end // up erasing that other entry. That's an acceptable risk / imprecision. - } while ( - !h->meta.CasWeak(old_meta, uint64_t{ClockHandle::kStateConstruction} - << ClockHandle::kStateShift)); + } while (!h->meta.CasWeak(old_meta, construction_meta)); // Took ownership size_t total_charge = h->GetTotalCharge(); if (UNLIKELY(h->IsStandalone())) { @@ -966,7 +957,7 @@ bool FixedHyperClockTable::Release(HandleImpl* h, bool useful, } #ifndef NDEBUG -void FixedHyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) { +void FixedHyperClockTable::TEST_ReleaseN(HandleImpl* h, uint32_t n) { if (n > 0) { // Do n-1 simple releases first TEST_ReleaseNMinus1(h, n); @@ -983,30 +974,29 @@ void FixedHyperClockTable::Erase(const UniqueId64x2& hashed_key) { [&](HandleImpl* h) { // Could be multiple entries in rare cases. Erase them all. // Optimistically increment acquire counter - uint64_t old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement); + auto add_acquire = AcquireCounter::PlusTransformPromiseNoOverflow(1); + SlotMeta old_meta, meta; + h->meta.Apply(add_acquire, &old_meta, &meta); // Check if it's an entry visible to lookups - if ((old_meta >> ClockHandle::kStateShift) == - ClockHandle::kStateVisible) { + if (meta.IsVisible()) { // Acquired a read reference if (h->hashed_key == hashed_key) { - // Match. Set invisible. - old_meta = - h->meta.FetchAnd(~(uint64_t{ClockHandle::kStateVisibleBit} - << ClockHandle::kStateShift)); - // Apply update to local copy - old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit} - << ClockHandle::kStateShift); + // Match. Take ownership if no other refs, or set invisible other + // refs exist. for (;;) { - uint64_t refcount = GetRefcount(old_meta); + uint32_t refcount = meta.GetRefcount(); assert(refcount > 0); if (refcount > 1) { // Not last ref at some point in time during this Erase call - // Pretend we never took the reference + // Set invisible + h->meta.Apply(SlotMeta::VisibleFlag::ClearTransform()); + // And pretend we never took the reference Unref(*h); break; - } else if (h->meta.CasWeak( - old_meta, uint64_t{ClockHandle::kStateConstruction} - << ClockHandle::kStateShift)) { + } + SlotMeta construction_meta; + construction_meta.SetUnderConstruction(); + if (h->meta.CasWeak(meta, construction_meta)) { // Took ownership assert(hashed_key == h->hashed_key); size_t total_charge = h->GetTotalCharge(); @@ -1022,8 +1012,7 @@ void FixedHyperClockTable::Erase(const UniqueId64x2& hashed_key) { // Mismatch. Pretend we never took the reference Unref(*h); } - } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == - ClockHandle::kStateInvisible)) { + } else if (UNLIKELY(old_meta.IsInvisible())) { // Pretend we never took the reference Unref(*h); } else { @@ -1040,17 +1029,17 @@ void FixedHyperClockTable::EraseUnRefEntries() { for (size_t i = 0; i <= this->length_bits_mask_; i++) { HandleImpl& h = array_[i]; - uint64_t old_meta = h.meta.LoadRelaxed(); - if (old_meta & (uint64_t{ClockHandle::kStateShareableBit} - << ClockHandle::kStateShift) && - GetRefcount(old_meta) == 0 && - h.meta.CasStrong(old_meta, uint64_t{ClockHandle::kStateConstruction} - << ClockHandle::kStateShift)) { - // Took ownership - size_t total_charge = h.GetTotalCharge(); - Rollback(h.hashed_key, &h); - FreeDataMarkEmpty(h, allocator_); - ReclaimEntryUsage(total_charge); + SlotMeta old_meta = h.meta.LoadRelaxed(); + if (old_meta.IsShareable() && old_meta.GetRefcount() == 0) { + SlotMeta construction_meta; + construction_meta.SetUnderConstruction(); + if (h.meta.CasStrong(old_meta, construction_meta)) { + // Took ownership + size_t total_charge = h.GetTotalCharge(); + Rollback(h.hashed_key, &h); + FreeDataMarkEmpty(h, allocator_); + ReclaimEntryUsage(total_charge); + } } } } @@ -1113,8 +1102,7 @@ inline void FixedHyperClockTable::ReclaimEntryUsage(size_t total_charge) { } inline void FixedHyperClockTable::Evict(size_t requested_charge, InsertState&, - EvictionData* data, - uint32_t eviction_effort_cap) { + EvictionData* data) { // precondition assert(requested_charge > 0); @@ -1149,7 +1137,7 @@ inline void FixedHyperClockTable::Evict(size_t requested_charge, InsertState&, if (old_clock_pointer >= max_clock_pointer) { return; } - if (IsEvictionEffortExceeded(*data, eviction_effort_cap)) { + if (IsEvictionEffortExceeded(*data)) { eviction_effort_exceeded_count_.FetchAddRelaxed(1); return; } @@ -1167,14 +1155,11 @@ ClockCacheShard
::ClockCacheShard( const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed, const typename Table::Opts& opts) : CacheShardBase(metadata_charge_policy), - table_(capacity, metadata_charge_policy, allocator, eviction_callback, - hash_seed, opts), - capacity_(capacity), - eec_and_scl_(SanitizeEncodeEecAndScl(opts.eviction_effort_cap, - strict_capacity_limit)) { + table_(capacity, strict_capacity_limit, metadata_charge_policy, allocator, + eviction_callback, hash_seed, opts) { // Initial charge metadata should not exceed capacity - assert(table_.GetUsage() <= capacity_.LoadRelaxed() || - capacity_.LoadRelaxed() < sizeof(HandleImpl)); + assert(table_.GetUsage() <= table_.GetCapacity() || + table_.GetCapacity() < sizeof(HandleImpl)); } template @@ -1240,18 +1225,14 @@ int FixedHyperClockTable::CalcHashBits( template void ClockCacheShard
::SetCapacity(size_t capacity) { - capacity_.StoreRelaxed(capacity); + table_.SetCapacity(capacity); // next Insert will take care of any necessary evictions } template void ClockCacheShard
::SetStrictCapacityLimit( bool strict_capacity_limit) { - if (strict_capacity_limit) { - eec_and_scl_.FetchOrRelaxed(kStrictCapacityLimitBit); - } else { - eec_and_scl_.FetchAndRelaxed(~kStrictCapacityLimitBit); - } + table_.SetStrictCapacityLimit(strict_capacity_limit); // next Insert will take care of any necessary evictions } @@ -1271,9 +1252,7 @@ Status ClockCacheShard
::Insert(const Slice& key, proto.value = value; proto.helper = helper; proto.total_charge = charge; - return table_.template Insert
(proto, handle, priority, - capacity_.LoadRelaxed(), - eec_and_scl_.LoadRelaxed()); + return table_.template Insert
(proto, handle, priority); } template @@ -1288,9 +1267,7 @@ typename Table::HandleImpl* ClockCacheShard
::CreateStandalone( proto.value = obj; proto.helper = helper; proto.total_charge = charge; - return table_.template CreateStandalone
(proto, capacity_.LoadRelaxed(), - eec_and_scl_.LoadRelaxed(), - allow_uncharged); + return table_.template CreateStandalone
(proto, allow_uncharged); } template @@ -1322,12 +1299,12 @@ bool ClockCacheShard
::Release(HandleImpl* handle, bool useful, #ifndef NDEBUG template -void ClockCacheShard
::TEST_RefN(HandleImpl* h, size_t n) { +void ClockCacheShard
::TEST_RefN(HandleImpl* h, uint32_t n) { table_.TEST_RefN(*h, n); } template -void ClockCacheShard
::TEST_ReleaseN(HandleImpl* h, size_t n) { +void ClockCacheShard
::TEST_ReleaseN(HandleImpl* h, uint32_t n) { table_.TEST_ReleaseN(h, n); } #endif @@ -1359,7 +1336,7 @@ size_t ClockCacheShard
::GetStandaloneUsage() const { template size_t ClockCacheShard
::GetCapacity() const { - return capacity_.LoadRelaxed(); + return table_.GetCapacity(); } template @@ -1375,8 +1352,8 @@ size_t ClockCacheShard
::GetPinnedUsage() const { metadata_charge_policy_ == kFullChargeCacheMetadata; ConstApplyToEntriesRange( [&table_pinned_usage, charge_metadata](const HandleImpl& h) { - uint64_t meta = h.meta.LoadRelaxed(); - uint64_t refcount = GetRefcount(meta); + SlotMeta meta = h.meta.LoadRelaxed(); + uint32_t refcount = meta.GetRefcount(); // Holding one ref for ConstApplyToEntriesRange assert(refcount > 0); if (refcount > 1) { @@ -1496,7 +1473,7 @@ void AddShardEvaluation(const FixedHyperClockCache::Shard& shard, } bool IsSlotOccupied(const ClockHandle& h) { - return (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) != 0; + return !h.meta.LoadRelaxed().IsEmpty(); } } // namespace @@ -1727,10 +1704,13 @@ inline uint64_t UsedLengthToLengthInfo(size_t used_length) { return length_info; } +// Avoid potential initialization order race with port::kPageSize +constexpr size_t kPresumedPageSize = 4096; + inline size_t GetStartingLength(size_t capacity) { - if (capacity > port::kPageSize) { + if (capacity > kPresumedPageSize) { // Start with one memory page - return port::kPageSize / sizeof(AutoHyperClockTable::HandleImpl); + return kPresumedPageSize / sizeof(AutoHyperClockTable::HandleImpl); } else { // Mostly to make unit tests happy return 4; @@ -1751,26 +1731,6 @@ inline void GetHomeIndexAndShift(uint64_t length_info, uint64_t hash, assert(*home < LengthInfoToUsedLength(length_info)); } -inline int GetShiftFromNextWithShift(uint64_t next_with_shift) { - return BitwiseAnd(next_with_shift, - AutoHyperClockTable::HandleImpl::kShiftMask); -} - -inline size_t GetNextFromNextWithShift(uint64_t next_with_shift) { - return static_cast(next_with_shift >> - AutoHyperClockTable::HandleImpl::kNextShift); -} - -inline uint64_t MakeNextWithShift(size_t next, int shift) { - return (uint64_t{next} << AutoHyperClockTable::HandleImpl::kNextShift) | - static_cast(shift); -} - -inline uint64_t MakeNextWithShiftEnd(size_t head, int shift) { - return AutoHyperClockTable::HandleImpl::kNextEndFlags | - MakeNextWithShift(head, shift); -} - // Helper function for Lookup inline bool MatchAndRef(const UniqueId64x2* hashed_key, const ClockHandle& h, int shift = 0, size_t home = 0, @@ -1778,12 +1738,12 @@ inline bool MatchAndRef(const UniqueId64x2* hashed_key, const ClockHandle& h, // Must be at least something to match assert(hashed_key || shift > 0); - uint64_t old_meta; + SlotMeta old_meta, new_meta; // (Optimistically) increment acquire counter. - old_meta = h.meta.FetchAdd(ClockHandle::kAcquireIncrement); + auto add_acquire = AcquireCounter::PlusTransformPromiseNoOverflow(1); + h.meta.Apply(add_acquire, &old_meta, &new_meta); // Check if it's a referencable (sharable) entry - if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit} - << ClockHandle::kStateShift)) == 0) { + if (!old_meta.IsShareable()) { // For non-sharable states, incrementing the acquire counter has no effect // so we don't need to undo it. Furthermore, we cannot safely undo // it because we did not acquire a read reference to lock the @@ -1794,10 +1754,9 @@ inline bool MatchAndRef(const UniqueId64x2* hashed_key, const ClockHandle& h, return false; } // Else acquired a read reference - assert(GetRefcount(old_meta + ClockHandle::kAcquireIncrement) > 0); + assert(new_meta.GetRefcount() > 0); if (hashed_key && h.hashed_key == *hashed_key && - LIKELY(old_meta & (uint64_t{ClockHandle::kStateVisibleBit} - << ClockHandle::kStateShift))) { + LIKELY(old_meta.IsVisible())) { // Match on full key, visible if (full_match_or_unknown) { *full_match_or_unknown = true; @@ -1820,36 +1779,39 @@ inline bool MatchAndRef(const UniqueId64x2* hashed_key, const ClockHandle& h, } } +using NextWithShift = AutoHyperClockTable::HandleImpl::NextWithShift; + // Assumes a chain rewrite lock prevents concurrent modification of // these chain pointers void UpgradeShiftsOnRange(AutoHyperClockTable::HandleImpl* arr, - size_t& frontier, uint64_t stop_before_or_new_tail, - int old_shift, int new_shift) { + size_t& frontier, + NextWithShift stop_before_or_new_tail, int old_shift, + int new_shift) { assert(frontier != SIZE_MAX); assert(new_shift == old_shift + 1); (void)old_shift; (void)new_shift; - using HandleImpl = AutoHyperClockTable::HandleImpl; for (;;) { - uint64_t next_with_shift = arr[frontier].chain_next_with_shift.Load(); - assert(GetShiftFromNextWithShift(next_with_shift) == old_shift); + NextWithShift next_with_shift = arr[frontier].chain_next_with_shift.Load(); + assert(next_with_shift.GetShift() == old_shift); if (next_with_shift == stop_before_or_new_tail) { // Stopping at entry with pointer matching "stop before" - assert(!HandleImpl::IsEnd(next_with_shift)); + assert(!next_with_shift.IsEnd()); return; } - if (HandleImpl::IsEnd(next_with_shift)) { + if (next_with_shift.IsEnd()) { // Also update tail to new tail - assert(HandleImpl::IsEnd(stop_before_or_new_tail)); + assert(stop_before_or_new_tail.IsEnd()); arr[frontier].chain_next_with_shift.Store(stop_before_or_new_tail); // Mark nothing left to upgrade frontier = SIZE_MAX; return; } // Next is another entry to process, so upgrade and advance frontier - arr[frontier].chain_next_with_shift.FetchAdd(1U); - assert(GetShiftFromNextWithShift(next_with_shift + 1) == new_shift); - frontier = GetNextFromNextWithShift(next_with_shift); + arr[frontier].chain_next_with_shift.Apply( + NextWithShift::Shift::PlusTransformPromiseNoOverflow(1U)); + assert(next_with_shift.GetShift() + 1 == new_shift); + frontier = next_with_shift.GetNext(); } } @@ -1887,19 +1849,19 @@ class AutoHyperClockTable::ChainRewriteLock { // RAII wrap existing lock held (or end) explicit ChainRewriteLock(HandleImpl* h, RelaxedAtomic& /*yield_count*/, - uint64_t already_locked_or_end) + NextWithShift already_locked_or_end) : head_ptr_(&h->head_next_with_shift) { saved_head_ = already_locked_or_end; // already locked or end - assert(saved_head_ & HandleImpl::kHeadLocked); + assert(saved_head_.IsLocked()); } ~ChainRewriteLock() { if (!IsEnd()) { // Release lock - uint64_t old = head_ptr_->FetchAnd(~HandleImpl::kHeadLocked); - (void)old; - assert((old & HandleImpl::kNextEndFlags) == HandleImpl::kHeadLocked); + NextWithShift old; + head_ptr_->Apply(NextWithShift::LockedFlag::ClearTransform(), &old); + assert(old.IsLockedNotEnd()); } } @@ -1909,12 +1871,13 @@ class AutoHyperClockTable::ChainRewriteLock { } // Expected current state, assuming no parallel updates. - uint64_t GetSavedHead() const { return saved_head_; } + NextWithShift GetSavedHead() const { return saved_head_; } - bool CasUpdate(uint64_t next_with_shift, + bool CasUpdate(NextWithShift next_with_shift, RelaxedAtomic& yield_count) { - uint64_t new_head = next_with_shift | HandleImpl::kHeadLocked; - uint64_t expected = GetSavedHead(); + NextWithShift new_head = + next_with_shift.With(true); + NextWithShift expected = GetSavedHead(); bool success = head_ptr_->CasStrong(expected, new_head); if (success) { // Ensure IsEnd() is kept up-to-date, including for dtor @@ -1923,7 +1886,7 @@ class AutoHyperClockTable::ChainRewriteLock { // Parallel update to head, such as Insert() if (IsEnd()) { // Didn't previously hold a lock - if (HandleImpl::IsEnd(expected)) { + if (expected.IsEnd()) { // Still don't need to saved_head_ = expected; } else { @@ -1932,28 +1895,25 @@ class AutoHyperClockTable::ChainRewriteLock { } } else { // Parallel update must preserve our lock - assert((expected & HandleImpl::kNextEndFlags) == - HandleImpl::kHeadLocked); + assert(expected.IsLockedNotEnd()); saved_head_ = expected; } } return success; } - bool IsEnd() const { return HandleImpl::IsEnd(saved_head_); } + bool IsEnd() const { return saved_head_.IsEnd(); } private: void Acquire(RelaxedAtomic& yield_count) { for (;;) { // Acquire removal lock on the chain - uint64_t old_head = head_ptr_->FetchOr(HandleImpl::kHeadLocked); - if ((old_head & HandleImpl::kNextEndFlags) != HandleImpl::kHeadLocked) { + NextWithShift old_head; + head_ptr_->Apply(NextWithShift::LockedFlag::SetTransform(), &old_head, + &saved_head_); + if (!old_head.IsLockedNotEnd()) { // Either acquired the lock or lock not needed (end) - assert((old_head & HandleImpl::kNextEndFlags) == 0 || - (old_head & HandleImpl::kNextEndFlags) == - HandleImpl::kNextEndFlags); - - saved_head_ = old_head | HandleImpl::kHeadLocked; + assert(old_head.IsEnd() == old_head.IsLocked()); break; } // NOTE: one of the few yield-wait loops, which is rare enough in practice @@ -1964,16 +1924,18 @@ class AutoHyperClockTable::ChainRewriteLock { } } - AcqRelAtomic* head_ptr_; - uint64_t saved_head_; + BitFieldsAtomic* head_ptr_; + NextWithShift saved_head_; }; AutoHyperClockTable::AutoHyperClockTable( - size_t capacity, CacheMetadataChargePolicy metadata_charge_policy, + size_t capacity, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy, MemoryAllocator* allocator, const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed, const Opts& opts) - : BaseClockTable(metadata_charge_policy, allocator, eviction_callback, + : BaseClockTable(capacity, strict_capacity_limit, opts.eviction_effort_cap, + metadata_charge_policy, allocator, eviction_callback, hash_seed), array_(MemMapping::AllocateLazyZeroed( sizeof(HandleImpl) * CalcMaxUsableLength(capacity, @@ -1985,6 +1947,11 @@ AutoHyperClockTable::AutoHyperClockTable( grow_frontier_(GetTableSize()), clock_pointer_mask_( BottomNBits(UINT64_MAX, LengthInfoToMinShift(length_info_.Load()))) { + if (array_.Get() == nullptr) { + fprintf(stderr, + "Anonymous mmap for RocksDB HyperClockCache failed. Aborting.\n"); + std::terminate(); + } if (metadata_charge_policy == CacheMetadataChargePolicy::kFullChargeCacheMetadata) { // NOTE: ignoring page boundaries for simplicity @@ -2013,9 +1980,9 @@ AutoHyperClockTable::AutoHyperClockTable( #endif if (major + i < used_length) { array_[i].head_next_with_shift.StoreRelaxed( - MakeNextWithShiftEnd(i, max_shift)); + NextWithShift::MakeEnd(i, max_shift)); array_[major + i].head_next_with_shift.StoreRelaxed( - MakeNextWithShiftEnd(major + i, max_shift)); + NextWithShift::MakeEnd(major + i, max_shift)); #ifndef NDEBUG // Extra invariant checking GetHomeIndexAndShift(length_info, i, &home, &shift); assert(home == i); @@ -2026,7 +1993,7 @@ AutoHyperClockTable::AutoHyperClockTable( #endif } else { array_[i].head_next_with_shift.StoreRelaxed( - MakeNextWithShiftEnd(i, min_shift)); + NextWithShift::MakeEnd(i, min_shift)); #ifndef NDEBUG // Extra invariant checking GetHomeIndexAndShift(length_info, i, &home, &shift); assert(home == i); @@ -2052,52 +2019,54 @@ AutoHyperClockTable::~AutoHyperClockTable() { HandleImpl::kUnusedMarker) { used_end++; } -#ifndef NDEBUG - for (size_t i = used_end; i < array_.Count(); i++) { - assert(array_[i].head_next_with_shift.LoadRelaxed() == 0); - assert(array_[i].chain_next_with_shift.LoadRelaxed() == 0); - assert(array_[i].meta.LoadRelaxed() == 0); - } + // This check can be extra expensive for a cache that is just created, + // maybe used for a small number of entries, as in a unit test, and then + // destroyed. Only do this in rare modes. REVISED: Don't scan the whole mmap, + // just a reasonable frontier past what we expect to have written. +#ifdef MUST_FREE_HEAP_ALLOCATIONS + for (size_t i = used_end; i < array_.Count() && i < used_end + 64U; i++) { + assert(array_[i].head_next_with_shift.LoadRelaxed() == + HandleImpl::kUnusedMarker); + assert(array_[i].chain_next_with_shift.LoadRelaxed() == + HandleImpl::kUnusedMarker); + assert(array_[i].meta.LoadRelaxed() == SlotMeta{}); + } +#endif // MUST_FREE_HEAP_ALLOCATIONS +#ifndef NDEBUG // Extra invariant checking std::vector was_populated(used_end); std::vector was_pointed_to(used_end); -#endif +#endif // !NDEBUG for (size_t i = 0; i < used_end; i++) { HandleImpl& h = array_[i]; - switch (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) { - case ClockHandle::kStateEmpty: - // noop - break; - case ClockHandle::kStateInvisible: // rare but possible - case ClockHandle::kStateVisible: - assert(GetRefcount(h.meta.LoadRelaxed()) == 0); - h.FreeData(allocator_); + SlotMeta meta = h.meta.LoadRelaxed(); + if (meta.IsShareable()) { + // NOTE: Reaching here invisible is rare but possible + assert(meta.GetRefcount() == 0); + h.FreeData(allocator_); #ifndef NDEBUG // Extra invariant checking - usage_.FetchSubRelaxed(h.total_charge); - occupancy_.FetchSubRelaxed(1U); - was_populated[i] = true; - if (!HandleImpl::IsEnd(h.chain_next_with_shift.LoadRelaxed())) { - assert((h.chain_next_with_shift.LoadRelaxed() & - HandleImpl::kHeadLocked) == 0); - size_t next = - GetNextFromNextWithShift(h.chain_next_with_shift.LoadRelaxed()); - assert(!was_pointed_to[next]); - was_pointed_to[next] = true; - } -#endif - break; - // otherwise - default: - assert(false); - break; + usage_.FetchSubRelaxed(h.total_charge); + occupancy_.FetchSubRelaxed(1U); + was_populated[i] = true; + if (!h.chain_next_with_shift.LoadRelaxed().IsEnd()) { + assert(!h.chain_next_with_shift.LoadRelaxed().IsLocked()); + size_t next = h.chain_next_with_shift.LoadRelaxed().GetNext(); + assert(!was_pointed_to[next]); + was_pointed_to[next] = true; + } +#endif // !NDEBUG + } else { + // Should be no transient "under construction" states unless a thread + // was killed or we are being destructed while another thread is still + // operating on the structure + assert(meta.IsEmpty()); } #ifndef NDEBUG // Extra invariant checking - if (!HandleImpl::IsEnd(h.head_next_with_shift.LoadRelaxed())) { - size_t next = - GetNextFromNextWithShift(h.head_next_with_shift.LoadRelaxed()); + if (!h.head_next_with_shift.LoadRelaxed().IsEnd()) { + size_t next = h.head_next_with_shift.LoadRelaxed().GetNext(); assert(!was_pointed_to[next]); was_pointed_to[next] = true; } -#endif +#endif // !NDEBUG } #ifndef NDEBUG // Extra invariant checking // This check is not perfect, but should detect most reasonable cases @@ -2110,7 +2079,7 @@ AutoHyperClockTable::~AutoHyperClockTable() { assert(!was_pointed_to[i]); } } -#endif +#endif // !NDEBUG // Metadata charging only follows the published table size assert(usage_.LoadRelaxed() == 0 || @@ -2208,10 +2177,10 @@ bool AutoHyperClockTable::Grow(InsertState& state) { // chain rewrite lock has been released. size_t old_old_home = BottomNBits(grow_home, old_shift - 1); for (;;) { - uint64_t old_old_head = array_[old_old_home].head_next_with_shift.Load(); - if (GetShiftFromNextWithShift(old_old_head) >= old_shift) { - if ((old_old_head & HandleImpl::kNextEndFlags) != - HandleImpl::kHeadLocked) { + NextWithShift old_old_head = + array_[old_old_home].head_next_with_shift.Load(); + if (old_old_head.GetShift() >= old_shift) { + if (!old_old_head.IsLockedNotEnd()) { break; } } @@ -2271,8 +2240,7 @@ void AutoHyperClockTable::CatchUpLengthInfoNoWait( if (published_usable_size < known_usable_grow_home) { int old_shift = FloorLog2(next_usable_size - 1); size_t old_home = BottomNBits(published_usable_size, old_shift); - int shift = GetShiftFromNextWithShift( - array_[old_home].head_next_with_shift.Load()); + int shift = array_[old_home].head_next_with_shift.Load().GetShift(); if (shift <= old_shift) { // Not ready break; @@ -2423,9 +2391,10 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home, ChainRewriteLock zero_head_lock(&arr[old_home], yield_count_); // Used for locking the one chain below - uint64_t saved_one_head; + NextWithShift saved_one_head; // One head has not been written to - assert(arr[grow_home].head_next_with_shift.Load() == 0); + assert(arr[grow_home].head_next_with_shift.Load() == + HandleImpl::kUnusedMarker); // old_home will also the head of the new "zero chain" -- all entries in the // "from" chain whose next hash bit is 0. grow_home will be head of the new @@ -2447,7 +2416,7 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home, assert(cur == SIZE_MAX); assert(chain_frontier_first == -1); - uint64_t next_with_shift = zero_head_lock.GetSavedHead(); + NextWithShift next_with_shift = zero_head_lock.GetSavedHead(); // Find a single representative for each target chain, or scan the whole // chain if some target chain has no representative. @@ -2460,16 +2429,16 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home, assert((cur == SIZE_MAX) == (zero_chain_frontier == SIZE_MAX && one_chain_frontier == SIZE_MAX)); - assert(GetShiftFromNextWithShift(next_with_shift) == old_shift); + assert(next_with_shift.GetShift() == old_shift); // Check for end of original chain - if (HandleImpl::IsEnd(next_with_shift)) { + if (next_with_shift.IsEnd()) { cur = SIZE_MAX; break; } // next_with_shift is not End - cur = GetNextFromNextWithShift(next_with_shift); + cur = next_with_shift.GetNext(); if (BottomNBits(arr[cur].hashed_key[1], new_shift) == old_home) { // Entry for zero chain @@ -2508,10 +2477,10 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home, (zero_chain_frontier == SIZE_MAX && one_chain_frontier == SIZE_MAX)); // Always update one chain's head first (safe), and mark it as locked - saved_one_head = HandleImpl::kHeadLocked | - (one_chain_frontier != SIZE_MAX - ? MakeNextWithShift(one_chain_frontier, new_shift) - : MakeNextWithShiftEnd(grow_home, new_shift)); + saved_one_head = one_chain_frontier != SIZE_MAX + ? NextWithShift::Make(one_chain_frontier, new_shift) + : NextWithShift::MakeEnd(grow_home, new_shift); + saved_one_head.Set(true); arr[grow_home].head_next_with_shift.Store(saved_one_head); // Make sure length_info_ hasn't been updated too early, as we're about @@ -2521,8 +2490,8 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home, // Try to set zero's head. if (zero_head_lock.CasUpdate( zero_chain_frontier != SIZE_MAX - ? MakeNextWithShift(zero_chain_frontier, new_shift) - : MakeNextWithShiftEnd(old_home, new_shift), + ? NextWithShift::Make(zero_chain_frontier, new_shift) + : NextWithShift::MakeEnd(old_home, new_shift), yield_count_)) { // Both heads successfully updated to new shift break; @@ -2556,10 +2525,10 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home, size_t& other_frontier = chain_frontier_first != 0 ? /*&*/ zero_chain_frontier : /*&*/ one_chain_frontier; - uint64_t stop_before_or_new_tail = + NextWithShift stop_before_or_new_tail = other_frontier != SIZE_MAX - ? /*stop before*/ MakeNextWithShift(other_frontier, old_shift) - : /*new tail*/ MakeNextWithShiftEnd( + ? /*stop before*/ NextWithShift::Make(other_frontier, old_shift) + : /*new tail*/ NextWithShift::MakeEnd( chain_frontier_first == 0 ? old_home : grow_home, new_shift); UpgradeShiftsOnRange(arr, first_frontier, stop_before_or_new_tail, old_shift, new_shift); @@ -2585,20 +2554,19 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home, ? /*&*/ zero_chain_frontier : /*&*/ one_chain_frontier; assert(cur != first_frontier); - assert(GetNextFromNextWithShift( - arr[first_frontier].chain_next_with_shift.Load()) == + assert(arr[first_frontier].chain_next_with_shift.Load().GetNext() == other_frontier); - uint64_t next_with_shift = arr[cur].chain_next_with_shift.Load(); + NextWithShift next_with_shift = arr[cur].chain_next_with_shift.Load(); // Check for end of original chain - if (HandleImpl::IsEnd(next_with_shift)) { + if (next_with_shift.IsEnd()) { // Can set upgraded tail on first chain - uint64_t first_new_tail = MakeNextWithShiftEnd( + NextWithShift first_new_tail = NextWithShift::MakeEnd( chain_frontier_first == 0 ? old_home : grow_home, new_shift); arr[first_frontier].chain_next_with_shift.Store(first_new_tail); // And upgrade remainder of other chain - uint64_t other_new_tail = MakeNextWithShiftEnd( + NextWithShift other_new_tail = NextWithShift::MakeEnd( chain_frontier_first != 0 ? old_home : grow_home, new_shift); UpgradeShiftsOnRange(arr, other_frontier, other_new_tail, old_shift, new_shift); @@ -2607,7 +2575,7 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home, } // next_with_shift is not End - cur = GetNextFromNextWithShift(next_with_shift); + cur = next_with_shift.GetNext(); int target_chain; if (BottomNBits(arr[cur].hashed_key[1], new_shift) == old_home) { @@ -2620,7 +2588,7 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home, } if (target_chain == chain_frontier_first) { // Found next entry to skip to on the first chain - uint64_t skip_to = MakeNextWithShift(cur, new_shift); + NextWithShift skip_to = NextWithShift::Make(cur, new_shift); arr[first_frontier].chain_next_with_shift.Store(skip_to); first_frontier = cur; // Upgrade other chain up to entry before that one @@ -2661,17 +2629,17 @@ void AutoHyperClockTable::PurgeImplLocked(OpData* op_data, HandleImpl* const arr = array_.Get(); - uint64_t next_with_shift = rewrite_lock.GetSavedHead(); - assert(!HandleImpl::IsEnd(next_with_shift)); - int home_shift = GetShiftFromNextWithShift(next_with_shift); + NextWithShift next_with_shift = rewrite_lock.GetSavedHead(); + assert(!next_with_shift.IsEnd()); + int home_shift = next_with_shift.GetShift(); (void)home; (void)home_shift; - size_t next = GetNextFromNextWithShift(next_with_shift); + size_t next = next_with_shift.GetNext(); assert(next < array_.Count()); HandleImpl* h = &arr[next]; HandleImpl* prev_to_keep = nullptr; #ifndef NDEBUG - uint64_t prev_to_keep_next_with_shift = 0; + NextWithShift prev_to_keep_next_with_shift{}; #endif // Whether there are entries between h and prev_to_keep that should be // purged from the chain. @@ -2698,20 +2666,17 @@ void AutoHyperClockTable::PurgeImplLocked(OpData* op_data, op_data->push_back(h); // Entries for eviction become purgeable purgeable = true; - assert((h->meta.Load() >> ClockHandle::kStateShift) == - ClockHandle::kStateConstruction); + assert(h->meta.Load().IsUnderConstruction()); } } else { (void)op_data; (void)data; - purgeable = ((h->meta.Load() >> ClockHandle::kStateShift) & - ClockHandle::kStateShareableBit) == 0; + purgeable = !h->meta.Load().IsShareable(); } } if (purgeable) { - assert((h->meta.Load() >> ClockHandle::kStateShift) == - ClockHandle::kStateConstruction); + assert(h->meta.Load().IsUnderConstruction()); pending_purge = true; } else if (pending_purge) { if (prev_to_keep) { @@ -2729,13 +2694,13 @@ void AutoHyperClockTable::PurgeImplLocked(OpData* op_data, // update any new entries just inserted in parallel. // Can simply restart (GetSavedHead() already updated from CAS failure). next_with_shift = rewrite_lock.GetSavedHead(); - assert(!HandleImpl::IsEnd(next_with_shift)); - next = GetNextFromNextWithShift(next_with_shift); + assert(!next_with_shift.IsEnd()); + next = next_with_shift.GetNext(); assert(next < array_.Count()); h = &arr[next]; pending_purge = false; assert(prev_to_keep == nullptr); - assert(GetShiftFromNextWithShift(next_with_shift) == home_shift); + assert(next_with_shift.GetShift() == home_shift); continue; } pending_purge = false; @@ -2757,13 +2722,13 @@ void AutoHyperClockTable::PurgeImplLocked(OpData* op_data, } #endif - assert(GetShiftFromNextWithShift(next_with_shift) == home_shift); + assert(next_with_shift.GetShift() == home_shift); // Check for end marker - if (HandleImpl::IsEnd(next_with_shift)) { + if (next_with_shift.IsEnd()) { h = nullptr; } else { - next = GetNextFromNextWithShift(next_with_shift); + next = next_with_shift.GetNext(); assert(next < array_.Count()); h = &arr[next]; assert(h != prev_to_keep); @@ -2835,7 +2800,7 @@ void AutoHyperClockTable::PurgeImpl(OpData* op_data, size_t home, // Ensure we are at the correct home for the shift in effect for the // chain head. for (;;) { - int shift = GetShiftFromNextWithShift(rewrite_lock.GetSavedHead()); + int shift = rewrite_lock.GetSavedHead().GetShift(); if (shift > home_shift) { // Found a newer shift at candidate head, which must apply to us. @@ -2871,7 +2836,7 @@ void AutoHyperClockTable::PurgeImpl(OpData* op_data, size_t home, } AutoHyperClockTable::HandleImpl* AutoHyperClockTable::DoInsert( - const ClockHandleBasicData& proto, uint64_t initial_countdown, + const ClockHandleBasicData& proto, uint32_t initial_countdown, bool take_ref, InsertState& state) { size_t home; int orig_home_shift; @@ -3031,14 +2996,14 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::DoInsert( } // Now insert into chain using head pointer - uint64_t next_with_shift; + NextWithShift next_with_shift; int home_shift = orig_home_shift; // Might need to retry for (int i = 0;; ++i) { CHECK_TOO_MANY_ITERATIONS(i); next_with_shift = arr[home].head_next_with_shift.Load(); - int shift = GetShiftFromNextWithShift(next_with_shift); + int shift = next_with_shift.GetShift(); if (UNLIKELY(shift != home_shift)) { // NOTE: shift increases with table growth @@ -3065,15 +3030,14 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::DoInsert( } // Values to update to - uint64_t head_next_with_shift = MakeNextWithShift(idx, home_shift); - uint64_t chain_next_with_shift = next_with_shift; + NextWithShift head_next_with_shift = NextWithShift::Make(idx, home_shift); + NextWithShift chain_next_with_shift = next_with_shift; // Preserve the locked state in head, without propagating to chain next // where it is meaningless (and not allowed) - if (UNLIKELY((next_with_shift & HandleImpl::kNextEndFlags) == - HandleImpl::kHeadLocked)) { - head_next_with_shift |= HandleImpl::kHeadLocked; - chain_next_with_shift &= ~HandleImpl::kHeadLocked; + if (UNLIKELY(next_with_shift.IsLockedNotEnd())) { + head_next_with_shift.Set(true); + chain_next_with_shift.Set(false); } arr[idx].chain_next_with_shift.Store(chain_next_with_shift); @@ -3142,9 +3106,9 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup( // of a loop as possible. HandleImpl* const arr = array_.Get(); - uint64_t next_with_shift = arr[home].head_next_with_shift.LoadRelaxed(); - for (size_t i = 0; !HandleImpl::IsEnd(next_with_shift) && i < 10; ++i) { - HandleImpl* h = &arr[GetNextFromNextWithShift(next_with_shift)]; + NextWithShift next_with_shift = arr[home].head_next_with_shift.LoadRelaxed(); + for (size_t i = 0; !next_with_shift.IsEnd() && i < 10; ++i) { + HandleImpl* h = &arr[next_with_shift.IsEnd()]; // Attempt cheap key match without acquiring a read ref. This could give a // false positive, which is re-checked after acquiring read ref, or false // negative, which is re-checked in the full Lookup. Also, this is a @@ -3157,14 +3121,14 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup( #endif if (probably_equal) { // Increment acquire counter for definitive check - uint64_t old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement); + auto add_acquire = AcquireCounter::PlusTransformPromiseNoOverflow(1); + SlotMeta old_meta, new_meta; + h->meta.Apply(add_acquire, &old_meta, &new_meta); // Check if it's a referencable (sharable) entry - if (LIKELY(old_meta & (uint64_t{ClockHandle::kStateShareableBit} - << ClockHandle::kStateShift))) { - assert(GetRefcount(old_meta + ClockHandle::kAcquireIncrement) > 0); + if (LIKELY(old_meta.IsShareable())) { + assert(new_meta.GetRefcount() > 0); if (LIKELY(h->hashed_key == hashed_key) && - LIKELY(old_meta & (uint64_t{ClockHandle::kStateVisibleBit} - << ClockHandle::kStateShift))) { + LIKELY(old_meta.IsVisible())) { return h; } else { Unref(*h); @@ -3189,7 +3153,7 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup( // Read head or chain pointer next_with_shift = h ? h->chain_next_with_shift.Load() : arr[home].head_next_with_shift.Load(); - int shift = GetShiftFromNextWithShift(next_with_shift); + int shift = next_with_shift.GetShift(); // Make sure it's usable size_t effective_home = home; @@ -3243,10 +3207,10 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup( } // Check for end marker - if (HandleImpl::IsEnd(next_with_shift)) { + if (next_with_shift.IsEnd()) { // To ensure we didn't miss anything in the chain, the end marker must // point back to the correct home. - if (LIKELY(GetNextFromNextWithShift(next_with_shift) == effective_home)) { + if (LIKELY(next_with_shift.GetNext() == effective_home)) { // Complete, clean iteration of the chain, not found. // Clean up. if (read_ref_on_chain) { @@ -3262,7 +3226,7 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup( } // Follow the next and check for full key match, home match, or neither - h = &arr[GetNextFromNextWithShift(next_with_shift)]; + h = &arr[next_with_shift.GetNext()]; bool full_match_or_unknown = false; if (MatchAndRef(&hashed_key, *h, shift, effective_home, &full_match_or_unknown)) { @@ -3285,7 +3249,7 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup( } // Update the hit bit if (eviction_callback_) { - h->meta.FetchOrRelaxed(uint64_t{1} << ClockHandle::kHitBitShift); + h->meta.ApplyRelaxed(SlotMeta::HitFlag::SetTransform()); } // All done. return h; @@ -3325,8 +3289,7 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup( } void AutoHyperClockTable::Remove(HandleImpl* h) { - assert((h->meta.Load() >> ClockHandle::kStateShift) == - ClockHandle::kStateConstruction); + assert(h->meta.Load().IsUnderConstruction()); const HandleImpl& c_h = *h; PurgeImpl(&c_h.hashed_key); @@ -3334,26 +3297,23 @@ void AutoHyperClockTable::Remove(HandleImpl* h) { bool AutoHyperClockTable::TryEraseHandle(HandleImpl* h, bool holding_ref, bool mark_invisible) { - uint64_t meta; - if (mark_invisible) { - // Set invisible - meta = h->meta.FetchAnd( - ~(uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift)); - // To local variable also - meta &= - ~(uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift); - } else { - meta = h->meta.Load(); - } + SlotMeta meta = h->meta.Load(); + assert(!holding_ref || meta.IsShareable()); - // Take ownership if no other refs + // Take ownership if no other refs, or set invisible if other refs exist (and + // mark_invisible is set). + SlotMeta construction_meta; + construction_meta.SetUnderConstruction(); do { - if (GetRefcount(meta) != uint64_t{holding_ref}) { + if (meta.GetRefcount() != uint32_t{holding_ref}) { // Not last ref at some point in time during this call + if (mark_invisible) { + // Set invisible + h->meta.Apply(SlotMeta::VisibleFlag::ClearTransform()); + } return false; } - if ((meta & (uint64_t{ClockHandle::kStateShareableBit} - << ClockHandle::kStateShift)) == 0) { + if (!meta.IsShareable()) { // Someone else took ownership return false; } @@ -3361,8 +3321,7 @@ bool AutoHyperClockTable::TryEraseHandle(HandleImpl* h, bool holding_ref, // another thread replaces this entry with another, reaches zero refs, and // then we end up erasing that other entry. That's an acceptable risk / // imprecision. - } while (!h->meta.CasWeak(meta, uint64_t{ClockHandle::kStateConstruction} - << ClockHandle::kStateShift)); + } while (!h->meta.CasWeak(meta, construction_meta)); // Took ownership // TODO? Delay freeing? h->FreeData(allocator_); @@ -3389,27 +3348,24 @@ bool AutoHyperClockTable::Release(HandleImpl* h, bool useful, // is needed) and Erase. We do this to avoid an extra atomic read of the // variable usage_. - uint64_t old_meta; + SlotMeta old_meta; if (useful) { // Increment release counter to indicate was used - old_meta = h->meta.FetchAdd(ClockHandle::kReleaseIncrement); + auto add_release = ReleaseCounter::PlusTransformPromiseNoOverflow(1); + h->meta.Apply(add_release, &old_meta); // Correct for possible (but rare) overflow CorrectNearOverflow(old_meta, h->meta); } else { // Decrement acquire counter to pretend it never happened - old_meta = h->meta.FetchSub(ClockHandle::kAcquireIncrement); + auto sub_acquire = AcquireCounter::MinusTransformPromiseNoUnderflow(1); + h->meta.Apply(sub_acquire, &old_meta); } - assert((old_meta >> ClockHandle::kStateShift) & - ClockHandle::kStateShareableBit); + assert(old_meta.IsShareable()); // No underflow - assert(((old_meta >> ClockHandle::kAcquireCounterShift) & - ClockHandle::kCounterMask) != - ((old_meta >> ClockHandle::kReleaseCounterShift) & - ClockHandle::kCounterMask)); + assert(old_meta.GetAcquireCounter() != old_meta.GetReleaseCounter()); - if ((erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift == - ClockHandle::kStateInvisible))) { + if ((erase_if_last_ref || UNLIKELY(old_meta.IsInvisible()))) { // FIXME: There's a chance here that another thread could replace this // entry and we end up erasing the wrong one. return TryEraseHandle(h, /*holding_ref=*/false, /*mark_invisible=*/false); @@ -3419,7 +3375,7 @@ bool AutoHyperClockTable::Release(HandleImpl* h, bool useful, } #ifndef NDEBUG -void AutoHyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) { +void AutoHyperClockTable::TEST_ReleaseN(HandleImpl* h, uint32_t n) { if (n > 0) { // Do n-1 simple releases first TEST_ReleaseNMinus1(h, n); @@ -3449,27 +3405,26 @@ void AutoHyperClockTable::EraseUnRefEntries() { for (size_t i = 0; i < usable_size; i++) { HandleImpl& h = array_[i]; - uint64_t old_meta = h.meta.LoadRelaxed(); - if (old_meta & (uint64_t{ClockHandle::kStateShareableBit} - << ClockHandle::kStateShift) && - GetRefcount(old_meta) == 0 && - h.meta.CasStrong(old_meta, uint64_t{ClockHandle::kStateConstruction} - << ClockHandle::kStateShift)) { - // Took ownership - h.FreeData(allocator_); - usage_.FetchSubRelaxed(h.total_charge); - // NOTE: could be more efficient with a dedicated variant of - // PurgeImpl, but this is not a common operation - Remove(&h); - MarkEmpty(h); - occupancy_.FetchSub(1U); + SlotMeta old_meta = h.meta.LoadRelaxed(); + if (old_meta.IsShareable() && old_meta.GetRefcount() == 0) { + SlotMeta construction_meta; + construction_meta.SetUnderConstruction(); + if (h.meta.CasStrong(old_meta, construction_meta)) { + // Took ownership + h.FreeData(allocator_); + usage_.FetchSubRelaxed(h.total_charge); + // NOTE: could be more efficient with a dedicated variant of + // PurgeImpl, but this is not a common operation + Remove(&h); + MarkEmpty(h); + occupancy_.FetchSub(1U); + } } } } void AutoHyperClockTable::Evict(size_t requested_charge, InsertState& state, - EvictionData* data, - uint32_t eviction_effort_cap) { + EvictionData* data) { // precondition assert(requested_charge > 0); @@ -3561,7 +3516,7 @@ void AutoHyperClockTable::Evict(size_t requested_charge, InsertState& state, return; } - if (IsEvictionEffortExceeded(*data, eviction_effort_cap)) { + if (IsEvictionEffortExceeded(*data)) { eviction_effort_exceeded_count_.FetchAddRelaxed(1); return; } @@ -3579,7 +3534,7 @@ size_t AutoHyperClockTable::CalcMaxUsableLength( size_t num_slots = static_cast(capacity / min_avg_slot_charge + 0.999999); - const size_t slots_per_page = port::kPageSize / sizeof(HandleImpl); + const size_t slots_per_page = kPresumedPageSize / sizeof(HandleImpl); // Round up to page size return ((num_slots + slots_per_page - 1) / slots_per_page) * slots_per_page; @@ -3587,8 +3542,7 @@ size_t AutoHyperClockTable::CalcMaxUsableLength( namespace { bool IsHeadNonempty(const AutoHyperClockTable::HandleImpl& h) { - return !AutoHyperClockTable::HandleImpl::IsEnd( - h.head_next_with_shift.LoadRelaxed()); + return !h.head_next_with_shift.LoadRelaxed().IsEnd(); } bool IsEntryAtHome(const AutoHyperClockTable::HandleImpl& h, int shift, size_t home) { diff --git a/cache/clock_cache.h b/cache/clock_cache.h index 2d5d0d9eef3c..efce8a69e352 100644 --- a/cache/clock_cache.h +++ b/cache/clock_cache.h @@ -9,8 +9,6 @@ #pragma once -#include -#include #include #include #include @@ -19,14 +17,10 @@ #include "cache/cache_key.h" #include "cache/sharded_cache.h" -#include "port/lang.h" -#include "port/malloc.h" #include "port/mmap.h" -#include "port/port.h" #include "rocksdb/cache.h" -#include "rocksdb/secondary_cache.h" #include "util/atomic.h" -#include "util/autovector.h" +#include "util/bit_fields.h" #include "util/math.h" namespace ROCKSDB_NAMESPACE { @@ -323,40 +317,89 @@ struct ClockHandle : public ClockHandleBasicData { // | acquire counter | release counter | hit bit | state marker | // ----------------------------------------------------------------------- - // For reading or updating counters in meta word. - static constexpr uint8_t kCounterNumBits = 30; - static constexpr uint64_t kCounterMask = (uint64_t{1} << kCounterNumBits) - 1; - - static constexpr uint8_t kAcquireCounterShift = 0; - static constexpr uint64_t kAcquireIncrement = uint64_t{1} - << kAcquireCounterShift; - static constexpr uint8_t kReleaseCounterShift = kCounterNumBits; - static constexpr uint64_t kReleaseIncrement = uint64_t{1} - << kReleaseCounterShift; - - // For setting the hit bit - static constexpr uint8_t kHitBitShift = 2U * kCounterNumBits; - static constexpr uint64_t kHitBitMask = uint64_t{1} << kHitBitShift; - - // For reading or updating the state marker in meta word - static constexpr uint8_t kStateShift = kHitBitShift + 1; - - // Bits contribution to state marker. - // Occupied means any state other than empty - static constexpr uint8_t kStateOccupiedBit = 0b100; - // Shareable means the entry is reference counted (visible or invisible) - // (only set if also occupied) - static constexpr uint8_t kStateShareableBit = 0b010; - // Visible is only set if also shareable - static constexpr uint8_t kStateVisibleBit = 0b001; - - // Complete state markers (not shifted into full word) - static constexpr uint8_t kStateEmpty = 0b000; - static constexpr uint8_t kStateConstruction = kStateOccupiedBit; - static constexpr uint8_t kStateInvisible = - kStateOccupiedBit | kStateShareableBit; - static constexpr uint8_t kStateVisible = - kStateOccupiedBit | kStateShareableBit | kStateVisibleBit; + struct SlotMeta : public BitFields { + // For reading or updating counters in meta word. + static constexpr uint8_t kCounterNumBits = 30; + // Number of times the a reference has been acquired (or attempted) + // since last reset by eviction processing + using AcquireCounter = + UnsignedBitField; + // Number of times the a reference has been released (or attempted) + // since last reset by eviction processing + using ReleaseCounter = + UnsignedBitField; + // Metadata bit in support of secondary cache + using HitFlag = BoolBitField; + // Occupied means any state other than empty + using OccupiedFlag = BoolBitField; + // Shareable means the entry is reference counted (visible or invisible) + // (only set if also occupied) + using ShareableFlag = BoolBitField; + // Visible is only set if also shareable (invisible can't be found by + // Lookup) + using VisibleFlag = BoolBitField; + + // Convenience functions + uint32_t GetAcquireCounter() const { return Get(); } + void SetAcquireCounter(uint32_t val) { Set(val); } + uint32_t GetReleaseCounter() const { return Get(); } + void SetReleaseCounter(uint32_t val) { Set(val); } + uint32_t GetRefcount() const { + return Get() - Get(); + } + bool GetHit() const { return Get(); } + void SetHit(bool val) { Set(val); } + + // Some distinct states for the various state flags + bool IsEmpty() const { + bool rv = !Get(); + if (rv) { + assert(!Get()); + assert(!Get()); + } + return rv; + } + + bool IsUnderConstruction() const { + bool rv = Get() && !Get(); + if (rv) { + assert(!Get()); + } + return rv; + } + void SetUnderConstruction() { + Set(true); + Set(false); + Set(false); + } + + bool IsShareable() const { return Get(); } + bool IsInvisible() const { + bool rv = Get() && !Get(); + if (rv) { + assert(Get()); + } + return rv; + } + void SetInvisible() { + Set(true); + Set(true); + Set(false); + } + + bool IsVisible() const { + bool rv = Get() && Get(); + if (rv) { + assert(Get()); + } + return rv; + } + void SetVisible() { + Set(true); + Set(true); + Set(true); + } + }; // Constants for initializing the countdown clock. (Countdown clock is only // in effect with zero refs, acquire counter == release counter, and in that @@ -370,7 +413,7 @@ struct ClockHandle : public ClockHandleBasicData { // TODO: make these coundown values tuning parameters for eviction? // See above. Mutable for read reference counting. - mutable AcqRelAtomic meta{}; + mutable BitFieldsAtomic meta{}; }; // struct ClockHandle class BaseClockTable { @@ -383,25 +426,20 @@ class BaseClockTable { int eviction_effort_cap; }; - BaseClockTable(CacheMetadataChargePolicy metadata_charge_policy, + BaseClockTable(size_t capacity, bool strict_capacity_limit, + int eviction_effort_cap, + CacheMetadataChargePolicy metadata_charge_policy, MemoryAllocator* allocator, const Cache::EvictionCallback* eviction_callback, - const uint32_t* hash_seed) - : metadata_charge_policy_(metadata_charge_policy), - allocator_(allocator), - eviction_callback_(*eviction_callback), - hash_seed_(*hash_seed) {} + const uint32_t* hash_seed); template typename Table::HandleImpl* CreateStandalone(ClockHandleBasicData& proto, - size_t capacity, - uint32_t eec_and_scl, bool allow_uncharged); template Status Insert(const ClockHandleBasicData& proto, - typename Table::HandleImpl** handle, Cache::Priority priority, - size_t capacity, uint32_t eec_and_scl); + typename Table::HandleImpl** handle, Cache::Priority priority); void Ref(ClockHandle& handle); @@ -411,6 +449,18 @@ class BaseClockTable { size_t GetStandaloneUsage() const { return standalone_usage_.LoadRelaxed(); } + size_t GetCapacity() const { return capacity_.LoadRelaxed(); } + + void SetCapacity(size_t capacity) { capacity_.StoreRelaxed(capacity); } + + void SetStrictCapacityLimit(bool strict_capacity_limit) { + if (strict_capacity_limit) { + eec_and_scl_.ApplyRelaxed(StrictCapacityLimit::SetTransform()); + } else { + eec_and_scl_.ApplyRelaxed(StrictCapacityLimit::ClearTransform()); + } + } + uint32_t GetHashSeed() const { return hash_seed_; } uint64_t GetYieldCount() const { return yield_count_.LoadRelaxed(); } @@ -427,11 +477,12 @@ class BaseClockTable { void TrackAndReleaseEvictedEntry(ClockHandle* h); + bool IsEvictionEffortExceeded(const BaseClockTable::EvictionData& data) const; #ifndef NDEBUG // Acquire N references - void TEST_RefN(ClockHandle& handle, size_t n); + void TEST_RefN(ClockHandle& handle, uint32_t n); // Helper for TEST_ReleaseN - void TEST_ReleaseNMinus1(ClockHandle* handle, size_t n); + void TEST_ReleaseNMinus1(ClockHandle* handle, uint32_t n); #endif private: // fns @@ -448,9 +499,8 @@ class BaseClockTable { // required, and the operation should fail if not possible. // NOTE: Otherwise, occupancy_ is not managed in this function template - Status ChargeUsageMaybeEvictStrict(size_t total_charge, size_t capacity, + Status ChargeUsageMaybeEvictStrict(size_t total_charge, bool need_evict_for_occupancy, - uint32_t eviction_effort_cap, typename Table::InsertState& state); // Helper for updating `usage_` for new entry with given `total_charge` @@ -462,9 +512,8 @@ class BaseClockTable { // true, indicating success. // NOTE: occupancy_ is not managed in this function template - bool ChargeUsageMaybeEvictNonStrict(size_t total_charge, size_t capacity, + bool ChargeUsageMaybeEvictNonStrict(size_t total_charge, bool need_evict_for_occupancy, - uint32_t eviction_effort_cap, typename Table::InsertState& state); protected: // data @@ -489,13 +538,32 @@ class BaseClockTable { // TODO: is this separation needed if we don't do background evictions? ALIGN_AS(CACHE_LINE_SIZE) // Number of elements in the table. - AcqRelAtomic occupancy_{}; + Atomic occupancy_{}; // Memory usage by entries tracked by the cache (including standalone) - AcqRelAtomic usage_{}; + Atomic usage_{}; // Part of usage by standalone entries (not in table) - AcqRelAtomic standalone_usage_{}; + Atomic standalone_usage_{}; + + // Maximum total charge of all elements stored in the table. + // (Relaxed: eventual consistency/update is OK) + RelaxedAtomic capacity_; + + // Encodes eviction_effort_cap (bottom 31 bits) and strict_capacity_limit + // (top bit). See HyperClockCacheOptions::eviction_effort_cap etc. + struct EecAndScl : public BitFields { + uint32_t GetEffectiveEvictionEffortCap() const { + // Because setting strict_capacity_limit is supposed to imply infinite + // cap on eviction effort, we can let the bit for strict_capacity_limit + // in the upper-most bit position to used as part of the effective cap. + return underlying; + } + }; + using EvictionEffortCap = UnsignedBitField; + using StrictCapacityLimit = BoolBitField; + // (Relaxed: eventual consistency/update is OK) + RelaxedBitFieldsAtomic eec_and_scl_; ALIGN_AS(CACHE_LINE_SIZE) const CacheMetadataChargePolicy metadata_charge_policy_; @@ -551,7 +619,7 @@ class FixedHyperClockTable : public BaseClockTable { size_t estimated_value_size; }; - FixedHyperClockTable(size_t capacity, + FixedHyperClockTable(size_t capacity, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy, MemoryAllocator* allocator, const Cache::EvictionCallback* eviction_callback, @@ -567,14 +635,13 @@ class FixedHyperClockTable : public BaseClockTable { bool GrowIfNeeded(size_t new_occupancy, InsertState& state); HandleImpl* DoInsert(const ClockHandleBasicData& proto, - uint64_t initial_countdown, bool take_ref, + uint32_t initial_countdown, bool take_ref, InsertState& state); // Runs the clock eviction algorithm trying to reclaim at least // requested_charge. Returns how much is evicted, which could be less // if it appears impossible to evict the requested amount without blocking. - void Evict(size_t requested_charge, InsertState& state, EvictionData* data, - uint32_t eviction_effort_cap); + void Evict(size_t requested_charge, InsertState& state, EvictionData* data); HandleImpl* Lookup(const UniqueId64x2& hashed_key); @@ -596,7 +663,7 @@ class FixedHyperClockTable : public BaseClockTable { } // Release N references - void TEST_ReleaseN(HandleImpl* handle, size_t n); + void TEST_ReleaseN(HandleImpl* handle, uint32_t n); #endif // The load factor p is a real number in (0, 1) such that at all @@ -757,6 +824,7 @@ class AutoHyperClockTable : public BaseClockTable { // chain--specifically the next entry in the chain. // * The end of a chain is given a special "end" marker and refers back // to the head of the chain. + // These decorated pointers use the NextWithShift bit field struct below. // // Why do we need shift on each pointer? To make Lookup wait-free, we need // to be able to query a chain without missing anything, and preferably @@ -776,47 +844,63 @@ class AutoHyperClockTable : public BaseClockTable { // it is normal to see "under construction" entries on the chain, and it // is not safe to read their hashed key without either a read reference // on the entry or a rewrite lock on the chain. - - // Marker in a "with_shift" head pointer for some thread owning writes - // to the chain structure (except for inserts), but only if not an - // "end" pointer. Also called the "rewrite lock." - static constexpr uint64_t kHeadLocked = uint64_t{1} << 7; - - // Marker in a "with_shift" pointer for the end of a chain. Must also - // point back to the head of the chain (with end marker removed). - // Also includes the "locked" bit so that attempting to lock an empty - // chain has no effect (not needed, as the lock is only needed for - // removals). - static constexpr uint64_t kNextEndFlags = (uint64_t{1} << 6) | kHeadLocked; - - static inline bool IsEnd(uint64_t next_with_shift) { - // Assuming certain values never used, suffices to check this one bit - constexpr auto kCheckBit = kNextEndFlags ^ kHeadLocked; - return next_with_shift & kCheckBit; - } - - // Bottom bits to right shift away to get an array index from a - // "with_shift" pointer. - static constexpr int kNextShift = 8; - - // A bit mask for the "shift" associated with each "with_shift" pointer. - // Always bottommost bits. - static constexpr int kShiftMask = 63; + struct NextWithShift : public BitFields { + // The "shift" associated with this decorated pointer (see description + // above). + using Shift = UnsignedBitField; + // Marker for the end of a chain. Must also (a) point back to the head of + // the chain (with end marker removed), and (b) set the LockedFlag + // (below), so that attempting to lock an empty chain has no effect (not + // needed, as the lock is only needed for removals). + using EndFlag = BoolBitField; + // Marker that some thread owning writes to the chain structure (except + // for inserts), but only if not an "end" pointer. Also called the + // "rewrite lock." + using LockedFlag = BoolBitField; + // The "next" associated with this decorated pointer, which is an index + // into the table's array_ (see description above). + using Next = UnsignedBitField; + + bool IsLocked() const { return Get(); } + bool IsEnd() const { + // End flag should imply locked flag + assert(!Get() || Get()); + return Get(); + } + bool IsLockedNotEnd() const { + // NOTE: helping GCC to optimize this simpler code: + // return IsLocked() && !IsEnd(); + constexpr U kEndFlag = U{1} << EndFlag::kBitOffset; + constexpr U kLockedFlag = U{1} << LockedFlag::kBitOffset; + return (underlying & (kEndFlag | kLockedFlag)) == kLockedFlag; + } + auto GetNext() const { return Get(); } + auto GetShift() const { return Get(); } + + static NextWithShift Make(size_t next, int shift) { + return NextWithShift{}.With(next).With( + static_cast(shift)); + } + + static NextWithShift MakeEnd(size_t next, int shift) { + return Make(next, shift).With(true).With(true); + } + }; // A marker for head_next_with_shift that indicates this HandleImpl is // heap allocated (standalone) rather than in the table. - static constexpr uint64_t kStandaloneMarker = UINT64_MAX; + static constexpr NextWithShift kStandaloneMarker{UINT64_MAX}; // A marker for head_next_with_shift indicating the head is not yet part // of the usable table, or for chain_next_with_shift indicating that the // entry is not present or is not yet part of a chain (must not be // "shareable" state). - static constexpr uint64_t kUnusedMarker = 0; + static constexpr NextWithShift kUnusedMarker{0}; // See above. The head pointer is logically independent of the rest of // the entry, including the chain next pointer. - AcqRelAtomic head_next_with_shift{kUnusedMarker}; - AcqRelAtomic chain_next_with_shift{kUnusedMarker}; + BitFieldsAtomic head_next_with_shift{kUnusedMarker}; + BitFieldsAtomic chain_next_with_shift{kUnusedMarker}; // For supporting CreateStandalone and some fallback cases. inline bool IsStandalone() const { @@ -841,7 +925,7 @@ class AutoHyperClockTable : public BaseClockTable { size_t min_avg_value_size; }; - AutoHyperClockTable(size_t capacity, + AutoHyperClockTable(size_t capacity, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy, MemoryAllocator* allocator, const Cache::EvictionCallback* eviction_callback, @@ -862,14 +946,13 @@ class AutoHyperClockTable : public BaseClockTable { bool GrowIfNeeded(size_t new_occupancy, InsertState& state); HandleImpl* DoInsert(const ClockHandleBasicData& proto, - uint64_t initial_countdown, bool take_ref, + uint32_t initial_countdown, bool take_ref, InsertState& state); // Runs the clock eviction algorithm trying to reclaim at least // requested_charge. Returns how much is evicted, which could be less // if it appears impossible to evict the requested amount without blocking. - void Evict(size_t requested_charge, InsertState& state, EvictionData* data, - uint32_t eviction_effort_cap); + void Evict(size_t requested_charge, InsertState& state, EvictionData* data); HandleImpl* Lookup(const UniqueId64x2& hashed_key); @@ -891,7 +974,7 @@ class AutoHyperClockTable : public BaseClockTable { } // Release N references - void TEST_ReleaseN(HandleImpl* handle, size_t n); + void TEST_ReleaseN(HandleImpl* handle, uint32_t n); #endif // Maximum ratio of number of occupied slots to number of usable slots. The @@ -973,7 +1056,7 @@ class AutoHyperClockTable : public BaseClockTable { // To maximize parallelization of Grow() operations, this field is only // updated opportunistically after Grow() operations and in DoInsert() where // it is found to be out-of-date. See CatchUpLengthInfoNoWait(). - AcqRelAtomic length_info_; + Atomic length_info_; // An already-computed version of the usable length times the max load // factor. Could be slightly out of date but GrowIfNeeded()/Grow() handle @@ -1096,21 +1179,12 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase { return table_.TEST_MutableOccupancyLimit(); } // Acquire/release N references - void TEST_RefN(HandleImpl* handle, size_t n); - void TEST_ReleaseN(HandleImpl* handle, size_t n); + void TEST_RefN(HandleImpl* handle, uint32_t n); + void TEST_ReleaseN(HandleImpl* handle, uint32_t n); #endif private: // data Table table_; - - // Maximum total charge of all elements stored in the table. - // (Relaxed: eventual consistency/update is OK) - RelaxedAtomic capacity_; - - // Encodes eviction_effort_cap (bottom 31 bits) and strict_capacity_limit - // (top bit). See HyperClockCacheOptions::eviction_effort_cap etc. - // (Relaxed: eventual consistency/update is OK) - RelaxedAtomic eec_and_scl_; }; // class ClockCacheShard template diff --git a/cache/compressed_secondary_cache.cc b/cache/compressed_secondary_cache.cc index 4d3d0a2cddf7..d07a099ec872 100644 --- a/cache/compressed_secondary_cache.cc +++ b/cache/compressed_secondary_cache.cc @@ -16,6 +16,31 @@ #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { +namespace { +// Format of values in CompressedSecondaryCache: +// If enable_custom_split_merge: +// * A chain of CacheValueChunk representing the sequence of bytes for a tagged +// value. The overall length of the tagged value is determined by the chain +// of CacheValueChunks. +// If !enable_custom_split_merge: +// * A LengthPrefixedSlice (starts with varint64 size) of a tagged value. +// +// A tagged value has a 2-byte header before the "saved" or compressed block +// data: +// * 1 byte for "source" CacheTier indicating which tier is responsible for +// compression/decompression. +// * 1 byte for compression type which is generated/used by +// CompressedSecondaryCache iff source == CacheTier::kVolatileCompressedTier +// (original entry passed in was uncompressed). Otherwise, the compression +// type is preserved from the entry passed in. +constexpr uint32_t kTagSize = 2; + +// Size of tag + varint size prefix when applicable +uint32_t GetHeaderSize(size_t data_size, bool enable_split_merge) { + return (enable_split_merge ? 0 : VarintLength(kTagSize + data_size)) + + kTagSize; +} +} // namespace CompressedSecondaryCache::CompressedSecondaryCache( const CompressedSecondaryCacheOptions& opts) @@ -24,7 +49,13 @@ CompressedSecondaryCache::CompressedSecondaryCache( cache_res_mgr_(std::make_shared( std::make_shared>( cache_))), - disable_cache_(opts.capacity == 0) {} + disable_cache_(opts.capacity == 0) { + auto mgr = GetBuiltinV2CompressionManager(); + compressor_ = mgr->GetCompressor(cache_options_.compression_opts, + cache_options_.compression_type); + decompressor_ = + mgr->GetDecompressorOptimizeFor(cache_options_.compression_type); +} CompressedSecondaryCache::~CompressedSecondaryCache() = default; @@ -33,13 +64,9 @@ std::unique_ptr CompressedSecondaryCache::Lookup( Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase, Statistics* stats, bool& kept_in_sec_cache) { assert(helper); - // This is a minor optimization. Its ok to skip it in TSAN in order to - // avoid a false positive. -#ifndef __SANITIZE_THREAD__ - if (disable_cache_) { + if (disable_cache_.LoadRelaxed()) { return nullptr; } -#endif std::unique_ptr handle; kept_in_sec_cache = false; @@ -55,75 +82,58 @@ std::unique_ptr CompressedSecondaryCache::Lookup( return nullptr; } - CacheAllocationPtr* ptr{nullptr}; - CacheAllocationPtr merged_value; - size_t handle_value_charge{0}; - const char* data_ptr = nullptr; - CacheTier source = CacheTier::kVolatileCompressedTier; - CompressionType type = cache_options_.compression_type; + std::string merged_value; + Slice tagged_data; if (cache_options_.enable_custom_split_merge) { CacheValueChunk* value_chunk_ptr = - reinterpret_cast(handle_value); - merged_value = MergeChunksIntoValue(value_chunk_ptr, handle_value_charge); - ptr = &merged_value; - data_ptr = ptr->get(); + static_cast(handle_value); + merged_value = MergeChunksIntoValue(value_chunk_ptr); + tagged_data = Slice(merged_value); } else { - uint32_t type_32 = static_cast(type); - uint32_t source_32 = static_cast(source); - ptr = reinterpret_cast(handle_value); - handle_value_charge = cache_->GetCharge(lru_handle); - data_ptr = ptr->get(); - data_ptr = GetVarint32Ptr(data_ptr, data_ptr + 1, - static_cast(&type_32)); - type = static_cast(type_32); - data_ptr = GetVarint32Ptr(data_ptr, data_ptr + 1, - static_cast(&source_32)); - source = static_cast(source_32); - uint64_t data_size = 0; - data_ptr = GetVarint64Ptr(data_ptr, ptr->get() + handle_value_charge, - static_cast(&data_size)); - assert(handle_value_charge > data_size); - handle_value_charge = data_size; + tagged_data = GetLengthPrefixedSlice(static_cast(handle_value)); } - MemoryAllocator* allocator = cache_options_.memory_allocator.get(); - Status s; - Cache::ObjectPtr value{nullptr}; - size_t charge{0}; + auto source = lossless_cast(tagged_data[0]); + auto type = lossless_cast(tagged_data[1]); + + std::unique_ptr uncompressed; + Slice saved(tagged_data.data() + kTagSize, tagged_data.size() - kTagSize); if (source == CacheTier::kVolatileCompressedTier) { - if (cache_options_.compression_type == kNoCompression || - cache_options_.do_not_compress_roles.Contains(helper->role)) { - s = helper->create_cb(Slice(data_ptr, handle_value_charge), - kNoCompression, CacheTier::kVolatileTier, - create_context, allocator, &value, &charge); - } else { - UncompressionContext uncompression_context( - cache_options_.compression_type); - UncompressionInfo uncompression_info(uncompression_context, - UncompressionDict::GetEmptyDict(), - cache_options_.compression_type); - - size_t uncompressed_size{0}; - CacheAllocationPtr uncompressed = - UncompressData(uncompression_info, (char*)data_ptr, - handle_value_charge, &uncompressed_size, - cache_options_.compress_format_version, allocator); - - if (!uncompressed) { + if (type != kNoCompression) { + // TODO: can we do something to avoid yet another allocation? + Decompressor::Args args; + args.compressed_data = saved; + args.compression_type = type; + Status s = decompressor_->ExtractUncompressedSize(args); + assert(s.ok()); // in-memory data + if (s.ok()) { + uncompressed = std::make_unique(args.uncompressed_size); + s = decompressor_->DecompressBlock(args, uncompressed.get()); + assert(s.ok()); // in-memory data + } + if (!s.ok()) { cache_->Release(lru_handle, /*erase_if_last_ref=*/true); return nullptr; } - s = helper->create_cb(Slice(uncompressed.get(), uncompressed_size), - kNoCompression, CacheTier::kVolatileTier, - create_context, allocator, &value, &charge); + saved = Slice(uncompressed.get(), args.uncompressed_size); + type = kNoCompression; + // Free temporary compressed data as early as we can. This could matter + // for unusually large blocks because we also have + // * Another compressed copy above (from lru_cache). + // * The uncompressed copy in `uncompressed`. + // * Another uncompressed copy in `result_value` below. + // Let's try to max out at 3 copies instead of 4. + merged_value = std::string(); } - } else { - // The item was not compressed by us. Let the helper create_cb - // uncompress it - s = helper->create_cb(Slice(data_ptr, handle_value_charge), type, source, - create_context, allocator, &value, &charge); + // Reduced as if it came from primary cache + source = CacheTier::kVolatileTier; } + Cache::ObjectPtr result_value = nullptr; + size_t result_charge = 0; + Status s = helper->create_cb(saved, type, source, create_context, + cache_options_.memory_allocator.get(), + &result_value, &result_charge); if (!s.ok()) { cache_->Release(lru_handle, /*erase_if_last_ref=*/true); return nullptr; @@ -141,7 +151,8 @@ std::unique_ptr CompressedSecondaryCache::Lookup( kept_in_sec_cache = true; cache_->Release(lru_handle, /*erase_if_last_ref=*/false); } - handle.reset(new CompressedSecondaryCacheResultHandle(value, charge)); + handle.reset( + new CompressedSecondaryCacheResultHandle(result_value, result_charge)); RecordTick(stats, COMPRESSED_SECONDARY_CACHE_HITS); return handle; } @@ -164,88 +175,111 @@ bool CompressedSecondaryCache::MaybeInsertDummy(const Slice& key) { Status CompressedSecondaryCache::InsertInternal( const Slice& key, Cache::ObjectPtr value, - const Cache::CacheItemHelper* helper, CompressionType type, + const Cache::CacheItemHelper* helper, CompressionType from_type, CacheTier source) { - if (source != CacheTier::kVolatileCompressedTier && - cache_options_.enable_custom_split_merge) { - // We don't support custom split/merge for the tiered case - return Status::OK(); - } - - auto internal_helper = GetHelper(cache_options_.enable_custom_split_merge); - char header[20]; - char* payload = header; - payload = EncodeVarint32(payload, static_cast(type)); - payload = EncodeVarint32(payload, static_cast(source)); - size_t data_size = (*helper->size_cb)(value); - char* data_size_ptr = payload; - payload = EncodeVarint64(payload, data_size); - - size_t header_size = payload - header; - size_t total_size = data_size + header_size; - CacheAllocationPtr ptr = - AllocateBlock(total_size, cache_options_.memory_allocator.get()); - char* data_ptr = ptr.get() + header_size; - - Status s = (*helper->saveto_cb)(value, 0, data_size, data_ptr); + bool enable_split_merge = cache_options_.enable_custom_split_merge; + const Cache::CacheItemHelper* internal_helper = GetHelper(enable_split_merge); + + // TODO: variant of size_cb that also returns a pointer to the data if + // already available. Saves an allocation if we keep the compressed version. + const size_t data_size_original = (*helper->size_cb)(value); + + // Allocate enough memory for header/tag + original data because (a) we might + // not be attempting compression at all, and (b) we might keep the original if + // compression is insufficient. But we don't need the length prefix with + // enable_split_merge. TODO: be smarter with CacheValueChunk to save an + // allocation in the enable_split_merge case. + size_t header_size = GetHeaderSize(data_size_original, enable_split_merge); + CacheAllocationPtr allocation = AllocateBlock( + header_size + data_size_original, cache_options_.memory_allocator.get()); + char* data_ptr = allocation.get() + header_size; + Slice tagged_data(data_ptr - kTagSize, data_size_original + kTagSize); + assert(tagged_data.data() >= allocation.get()); + + Status s = (*helper->saveto_cb)(value, 0, data_size_original, data_ptr); if (!s.ok()) { return s; } - Slice val(data_ptr, data_size); - std::string compressed_val; - if (cache_options_.compression_type != kNoCompression && - type == kNoCompression && + std::unique_ptr tagged_compressed_data; + CompressionType to_type = kNoCompression; + if (compressor_ && from_type == kNoCompression && !cache_options_.do_not_compress_roles.Contains(helper->role)) { - PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, data_size); - CompressionContext compression_context(cache_options_.compression_type, - cache_options_.compression_opts); - uint64_t sample_for_compression{0}; - CompressionInfo compression_info( - cache_options_.compression_opts, compression_context, - CompressionDict::GetEmptyDict(), cache_options_.compression_type, - sample_for_compression); - - bool success = - CompressData(val, compression_info, - cache_options_.compress_format_version, &compressed_val); - - if (!success) { - return Status::Corruption("Error compressing value."); + assert(source == CacheTier::kVolatileCompressedTier); + + // TODO: consider malloc sizes for max acceptable compressed size + // Or maybe max_compressed_bytes_per_kb + size_t data_size_compressed = data_size_original - 1; + tagged_compressed_data = + std::make_unique(data_size_compressed + kTagSize); + s = compressor_->CompressBlock(Slice(data_ptr, data_size_original), + tagged_compressed_data.get() + kTagSize, + &data_size_compressed, &to_type, + nullptr /*working_area*/); + if (!s.ok()) { + return s; } - - val = Slice(compressed_val); - data_size = compressed_val.size(); - payload = EncodeVarint64(data_size_ptr, data_size); - header_size = payload - header; - total_size = header_size + data_size; - PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes, data_size); - - if (!cache_options_.enable_custom_split_merge) { - ptr = AllocateBlock(total_size, cache_options_.memory_allocator.get()); - data_ptr = ptr.get() + header_size; - memcpy(data_ptr, compressed_val.data(), data_size); + PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, + data_size_original); + if (to_type == kNoCompression) { + // Compression rejected or otherwise aborted/failed + to_type = kNoCompression; + tagged_compressed_data.reset(); + // TODO: consider separate counters for rejected compressions + PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes, + data_size_original); + } else { + PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes, + data_size_compressed); + if (enable_split_merge) { + // Only need tagged_data for copying into CacheValueChunks. + tagged_data = Slice(tagged_compressed_data.get(), + data_size_compressed + kTagSize); + allocation.reset(); + } else { + // Replace allocation with compressed version, copied from string + header_size = GetHeaderSize(data_size_compressed, enable_split_merge); + allocation = AllocateBlock(header_size + data_size_compressed, + cache_options_.memory_allocator.get()); + data_ptr = allocation.get() + header_size; + // Ignore unpopulated tag on tagged_compressed_data; will only be + // populated on the new allocation. + std::memcpy(data_ptr, tagged_compressed_data.get() + kTagSize, + data_size_compressed); + tagged_data = + Slice(data_ptr - kTagSize, data_size_compressed + kTagSize); + assert(tagged_data.data() >= allocation.get()); + } } } PERF_COUNTER_ADD(compressed_sec_cache_insert_real_count, 1); - if (cache_options_.enable_custom_split_merge) { + + // Save the tag fields + const_cast(tagged_data.data())[0] = lossless_cast(source); + const_cast(tagged_data.data())[1] = lossless_cast( + source == CacheTier::kVolatileCompressedTier ? to_type : from_type); + + if (enable_split_merge) { size_t split_charge{0}; - CacheValueChunk* value_chunks_head = SplitValueIntoChunks( - val, cache_options_.compression_type, split_charge); - return cache_->Insert(key, value_chunks_head, internal_helper, - split_charge); + CacheValueChunk* value_chunks_head = + SplitValueIntoChunks(tagged_data, split_charge); + s = cache_->Insert(key, value_chunks_head, internal_helper, split_charge); + assert(s.ok()); // LRUCache::Insert() with handle==nullptr always OK } else { + // Save the size prefix + char* ptr = allocation.get(); + ptr = EncodeVarint64(ptr, tagged_data.size()); + assert(ptr == tagged_data.data()); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - size_t charge = malloc_usable_size(ptr.get()); + size_t charge = malloc_usable_size(allocation.get()); #else - size_t charge = total_size; + size_t charge = tagged_data.size(); #endif - std::memcpy(ptr.get(), header, header_size); - CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr)); - charge += sizeof(CacheAllocationPtr); - return cache_->Insert(key, buf, internal_helper, charge); + s = cache_->Insert(key, allocation.release(), internal_helper, charge); + assert(s.ok()); // LRUCache::Insert() with handle==nullptr always OK } + return Status::OK(); } Status CompressedSecondaryCache::Insert(const Slice& key, @@ -267,7 +301,17 @@ Status CompressedSecondaryCache::Insert(const Slice& key, Status CompressedSecondaryCache::InsertSaved( const Slice& key, const Slice& saved, CompressionType type = kNoCompression, CacheTier source = CacheTier::kVolatileTier) { + if (source == CacheTier::kVolatileCompressedTier) { + // Unexpected, would violate InsertInternal preconditions + assert(source != CacheTier::kVolatileCompressedTier); + return Status::OK(); + } if (type == kNoCompression) { + // Not currently supported (why?) + return Status::OK(); + } + if (cache_options_.enable_custom_split_merge) { + // We don't support custom split/merge for the tiered case (why?) return Status::OK(); } @@ -287,7 +331,7 @@ Status CompressedSecondaryCache::SetCapacity(size_t capacity) { MutexLock l(&capacity_mutex_); cache_options_.capacity = capacity; cache_->SetCapacity(capacity); - disable_cache_ = capacity == 0; + disable_cache_.StoreRelaxed(capacity == 0); return Status::OK(); } @@ -311,15 +355,17 @@ std::string CompressedSecondaryCache::GetPrintableOptions() const { const_cast(cache_options_.compression_opts)) .c_str()); ret.append(buffer); - snprintf(buffer, kBufferSize, " compress_format_version : %d\n", - cache_options_.compress_format_version); - ret.append(buffer); return ret; } +// FIXME: this could use a lot of attention, including: +// * Use allocator +// * We shouldn't be worse than non-split; be more pro-actively aware of +// internal fragmentation +// * Consider a unified object/chunk structure that may or may not split +// * Optimize size overhead of chunks CompressedSecondaryCache::CacheValueChunk* CompressedSecondaryCache::SplitValueIntoChunks(const Slice& value, - CompressionType compression_type, size_t& charge) { assert(!value.empty()); const char* src_ptr = value.data(); @@ -340,15 +386,14 @@ CompressedSecondaryCache::SplitValueIntoChunks(const Slice& value, // size, or there is no compression. if (upper == malloc_bin_sizes_.begin() || upper == malloc_bin_sizes_.end() || - *upper - predicted_chunk_size < malloc_bin_sizes_.front() || - compression_type == kNoCompression) { + *upper - predicted_chunk_size < malloc_bin_sizes_.front()) { tmp_size = predicted_chunk_size; } else { tmp_size = *(--upper); } CacheValueChunk* new_chunk = - reinterpret_cast(new char[tmp_size]); + static_cast(static_cast(new char[tmp_size])); current_chunk->next = new_chunk; current_chunk = current_chunk->next; actual_chunk_size = tmp_size - sizeof(CacheValueChunk) + 1; @@ -363,28 +408,24 @@ CompressedSecondaryCache::SplitValueIntoChunks(const Slice& value, return dummy_head.next; } -CacheAllocationPtr CompressedSecondaryCache::MergeChunksIntoValue( - const void* chunks_head, size_t& charge) { - const CacheValueChunk* head = - reinterpret_cast(chunks_head); +std::string CompressedSecondaryCache::MergeChunksIntoValue( + const CacheValueChunk* head) { const CacheValueChunk* current_chunk = head; - charge = 0; + size_t total_size = 0; while (current_chunk != nullptr) { - charge += current_chunk->size; + total_size += current_chunk->size; current_chunk = current_chunk->next; } - CacheAllocationPtr ptr = - AllocateBlock(charge, cache_options_.memory_allocator.get()); + std::string result; + result.reserve(total_size); current_chunk = head; - size_t pos{0}; while (current_chunk != nullptr) { - memcpy(ptr.get() + pos, current_chunk->data, current_chunk->size); - pos += current_chunk->size; + result.append(current_chunk->data, current_chunk->size); current_chunk = current_chunk->next; } - - return ptr; + assert(result.size() == total_size); + return result; } const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper( @@ -398,16 +439,16 @@ const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper( CacheValueChunk* tmp_chunk = chunks_head; chunks_head = chunks_head->next; tmp_chunk->Free(); - obj = nullptr; } }}; return &kHelper; } else { static const Cache::CacheItemHelper kHelper{ CacheEntryRole::kMisc, - [](Cache::ObjectPtr obj, MemoryAllocator* /*alloc*/) { - delete static_cast(obj); - obj = nullptr; + [](Cache::ObjectPtr obj, MemoryAllocator* alloc) { + if (obj != nullptr) { + CacheAllocationDeleter{alloc}(static_cast(obj)); + } }}; return &kHelper; } @@ -418,12 +459,7 @@ size_t CompressedSecondaryCache::TEST_GetCharge(const Slice& key) { if (lru_handle == nullptr) { return 0; } - size_t charge = cache_->GetCharge(lru_handle); - if (cache_->Value(lru_handle) != nullptr && - !cache_options_.enable_custom_split_merge) { - charge -= 10; - } cache_->Release(lru_handle, /*erase_if_last_ref=*/false); return charge; } diff --git a/cache/compressed_secondary_cache.h b/cache/compressed_secondary_cache.h index 45eab656e44f..52b3d84b6dda 100644 --- a/cache/compressed_secondary_cache.h +++ b/cache/compressed_secondary_cache.h @@ -10,13 +10,12 @@ #include #include "cache/cache_reservation_manager.h" -#include "cache/lru_cache.h" #include "memory/memory_allocator_impl.h" +#include "rocksdb/advanced_compression.h" #include "rocksdb/secondary_cache.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" -#include "util/compression.h" -#include "util/mutexlock.h" +#include "util/atomic.h" namespace ROCKSDB_NAMESPACE { @@ -124,14 +123,9 @@ class CompressedSecondaryCache : public SecondaryCache { // Split value into chunks to better fit into jemalloc bins. The chunks // are stored in CacheValueChunk and extra charge is needed for each chunk, // so the cache charge is recalculated here. - CacheValueChunk* SplitValueIntoChunks(const Slice& value, - CompressionType compression_type, - size_t& charge); + CacheValueChunk* SplitValueIntoChunks(const Slice& value, size_t& charge); - // After merging chunks, the extra charge for each chunk is removed, so - // the charge is recalculated. - CacheAllocationPtr MergeChunksIntoValue(const void* chunks_head, - size_t& charge); + std::string MergeChunksIntoValue(const CacheValueChunk* head); bool MaybeInsertDummy(const Slice& key); @@ -145,9 +139,11 @@ class CompressedSecondaryCache : public SecondaryCache { const Cache::CacheItemHelper* GetHelper(bool enable_custom_split_merge) const; std::shared_ptr cache_; CompressedSecondaryCacheOptions cache_options_; + std::unique_ptr compressor_; + std::shared_ptr decompressor_; mutable port::Mutex capacity_mutex_; std::shared_ptr cache_res_mgr_; - bool disable_cache_; + RelaxedAtomic disable_cache_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/cache/compressed_secondary_cache_test.cc b/cache/compressed_secondary_cache_test.cc index df319390eedb..845df62f72c0 100644 --- a/cache/compressed_secondary_cache_test.cc +++ b/cache/compressed_secondary_cache_test.cc @@ -24,6 +24,14 @@ namespace ROCKSDB_NAMESPACE { using secondary_cache_test_util::GetTestingCacheTypes; using secondary_cache_test_util::WithCacheType; +// Read and reset a statistic +template +T Pop(T& var) { + T ret = var; + var = T(); + return ret; +} + // 16 bytes for HCC compatibility const std::string key0 = "____ ____key0"; const std::string key1 = "____ ____key1"; @@ -51,7 +59,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test, Random rnd(301); // Insert and Lookup the item k1 for the first time. - std::string str1(rnd.RandomString(1000)); + std::string str1 = test::CompressibleString(&rnd, 0.5, 1000); TestItem item1(str1.data(), str1.length()); // A dummy handle is inserted if the item is inserted for the first time. ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false)); @@ -68,7 +76,14 @@ class CompressedSecondaryCacheTestBase : public testing::Test, ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false)); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1); - ASSERT_GT(comp_sec_cache->TEST_GetCharge(key1), 1000); + if (sec_cache_is_compressed) { + ASSERT_GT(comp_sec_cache->TEST_GetCharge(key1), str1.length() / 4); + ASSERT_LT(comp_sec_cache->TEST_GetCharge(key1), str1.length() * 3 / 4); + } else { + ASSERT_GE(comp_sec_cache->TEST_GetCharge(key1), str1.length()); + // NOTE: split-merge is worse (1048 vs. 1024) + ASSERT_LE(comp_sec_cache->TEST_GetCharge(key1), 1048U); + } std::unique_ptr handle1_2 = sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true, @@ -76,10 +91,13 @@ class CompressedSecondaryCacheTestBase : public testing::Test, ASSERT_NE(handle1_2, nullptr); ASSERT_FALSE(kept_in_sec_cache); if (sec_cache_is_compressed) { - ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, - 1000); - ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, - 1007); + ASSERT_EQ( + Pop(get_perf_context()->compressed_sec_cache_uncompressed_bytes), + str1.length()); + ASSERT_LT(get_perf_context()->compressed_sec_cache_compressed_bytes, + str1.length() * 3 / 4); + ASSERT_GT(Pop(get_perf_context()->compressed_sec_cache_compressed_bytes), + str1.length() / 4); } else { ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); @@ -97,7 +115,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test, ASSERT_EQ(handle1_3, nullptr); // Insert and Lookup the item k2. - std::string str2(rnd.RandomString(1000)); + std::string str2 = test::CompressibleString(&rnd, 0.5, 1017); TestItem item2(str2.data(), str2.length()); ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false)); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2); @@ -109,10 +127,13 @@ class CompressedSecondaryCacheTestBase : public testing::Test, ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false)); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2); if (sec_cache_is_compressed) { - ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, - 2000); - ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, - 2014); + ASSERT_EQ( + Pop(get_perf_context()->compressed_sec_cache_uncompressed_bytes), + str2.length()); + ASSERT_LT(get_perf_context()->compressed_sec_cache_compressed_bytes, + str2.length() * 3 / 4); + ASSERT_GT(Pop(get_perf_context()->compressed_sec_cache_compressed_bytes), + str2.length() / 4); } else { ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); @@ -126,9 +147,48 @@ class CompressedSecondaryCacheTestBase : public testing::Test, ASSERT_NE(val2, nullptr); ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0); + // Release handles std::vector handles = {handle1_2.get(), handle2_2.get()}; sec_cache->WaitAll(handles); + handle1_2.reset(); + handle2_2.reset(); + + // Insert and Lookup a non-compressible item k3. + std::string str3 = rnd.RandomBinaryString(480); + TestItem item3(str3.data(), str3.length()); + ASSERT_OK(sec_cache->Insert(key3, &item3, GetHelper(), false)); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3); + std::unique_ptr handle3_1 = + sec_cache->Lookup(key3, GetHelper(), this, true, /*advise_erase=*/false, + /*stats=*/nullptr, kept_in_sec_cache); + ASSERT_EQ(handle3_1, nullptr); + + ASSERT_OK(sec_cache->Insert(key3, &item3, GetHelper(), false)); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 3); + if (sec_cache_is_compressed) { + // TODO: consider a compression rejected stat? + ASSERT_EQ( + Pop(get_perf_context()->compressed_sec_cache_uncompressed_bytes), + str3.length()); + ASSERT_EQ(Pop(get_perf_context()->compressed_sec_cache_compressed_bytes), + str3.length()); + } else { + ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); + } + + std::unique_ptr handle3_2 = + sec_cache->Lookup(key3, GetHelper(), this, true, /*advise_erase=*/false, + /*stats=*/nullptr, kept_in_sec_cache); + ASSERT_NE(handle3_2, nullptr); + std::unique_ptr val3 = + std::unique_ptr(static_cast(handle3_2->Value())); + ASSERT_NE(val3, nullptr); + ASSERT_EQ(memcmp(val3->Buf(), item3.Buf(), item3.Size()), 0); + + EXPECT_GE(comp_sec_cache->TEST_GetCharge(key3), str3.length()); + EXPECT_LE(comp_sec_cache->TEST_GetCharge(key3), 512); sec_cache.reset(); } @@ -178,8 +238,9 @@ class CompressedSecondaryCacheTestBase : public testing::Test, secondary_cache_opts.compression_type = CompressionType::kNoCompression; } - secondary_cache_opts.capacity = 1100; + secondary_cache_opts.capacity = 1400; secondary_cache_opts.num_shard_bits = 0; + secondary_cache_opts.strict_capacity_limit = true; std::shared_ptr sec_cache = NewCompressedSecondaryCache(secondary_cache_opts); @@ -193,7 +254,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test, ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false)); // Insert and Lookup the second item. - std::string str2(rnd.RandomString(200)); + std::string str2(rnd.RandomString(500)); TestItem item2(str2.data(), str2.length()); // Insert a dummy handle, k1 is not evicted. ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false)); @@ -201,16 +262,23 @@ class CompressedSecondaryCacheTestBase : public testing::Test, std::unique_ptr handle1 = sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false, /*stats=*/nullptr, kept_in_sec_cache); - ASSERT_EQ(handle1, nullptr); + ASSERT_NE(handle1, nullptr); + std::unique_ptr val1{static_cast(handle1->Value())}; + ASSERT_NE(val1, nullptr); + ASSERT_EQ(val1->ToString(), str1); + handle1.reset(); // Insert k2 and k1 is evicted. ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false)); + handle1 = + sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false, + /*stats=*/nullptr, kept_in_sec_cache); + ASSERT_EQ(handle1, nullptr); std::unique_ptr handle2 = sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false, /*stats=*/nullptr, kept_in_sec_cache); ASSERT_NE(handle2, nullptr); - std::unique_ptr val2 = - std::unique_ptr(static_cast(handle2->Value())); + std::unique_ptr val2{static_cast(handle2->Value())}; ASSERT_NE(val2, nullptr); ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0); @@ -232,7 +300,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test, // Save Fails. std::string str3 = rnd.RandomString(10); TestItem item3(str3.data(), str3.length()); - // The Status is OK because a dummy handle is inserted. + // The first Status is OK because a dummy handle is inserted. ASSERT_OK(sec_cache->Insert(key3, &item3, GetHelperFail(), false)); ASSERT_NOK(sec_cache->Insert(key3, &item3, GetHelperFail(), false)); @@ -265,11 +333,11 @@ class CompressedSecondaryCacheTestBase : public testing::Test, get_perf_context()->Reset(); Random rnd(301); - std::string str1 = rnd.RandomString(1001); + std::string str1 = test::CompressibleString(&rnd, 0.5, 1001); auto item1_1 = new TestItem(str1.data(), str1.length()); ASSERT_OK(cache->Insert(key1, item1_1, GetHelper(), str1.length())); - std::string str2 = rnd.RandomString(1012); + std::string str2 = test::CompressibleString(&rnd, 0.5, 1012); auto item2_1 = new TestItem(str2.data(), str2.length()); // After this Insert, primary cache contains k2 and secondary cache contains // k1's dummy item. @@ -278,7 +346,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test, ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); - std::string str3 = rnd.RandomString(1024); + std::string str3 = test::CompressibleString(&rnd, 0.5, 1024); auto item3_1 = new TestItem(str3.data(), str3.length()); // After this Insert, primary cache contains k3 and secondary cache contains // k1's dummy item and k2's dummy item. @@ -297,10 +365,13 @@ class CompressedSecondaryCacheTestBase : public testing::Test, ASSERT_OK(cache->Insert(key2, item2_2, GetHelper(), str2.length())); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1); if (sec_cache_is_compressed) { - ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, + ASSERT_EQ( + Pop(get_perf_context()->compressed_sec_cache_uncompressed_bytes), + str1.length()); + ASSERT_LT(get_perf_context()->compressed_sec_cache_compressed_bytes, str1.length()); - ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, - 1008); + ASSERT_GT(Pop(get_perf_context()->compressed_sec_cache_compressed_bytes), + str1.length() / 10); } else { ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); @@ -312,10 +383,13 @@ class CompressedSecondaryCacheTestBase : public testing::Test, ASSERT_OK(cache->Insert(key3, item3_2, GetHelper(), str3.length())); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2); if (sec_cache_is_compressed) { - ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, - str1.length() + str2.length()); - ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, - 2027); + ASSERT_EQ( + Pop(get_perf_context()->compressed_sec_cache_uncompressed_bytes), + str2.length()); + ASSERT_LT(get_perf_context()->compressed_sec_cache_compressed_bytes, + str2.length()); + ASSERT_GT(Pop(get_perf_context()->compressed_sec_cache_compressed_bytes), + str2.length() / 10); } else { ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); @@ -641,8 +715,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test, size_t str_size{8500}; std::string str = rnd.RandomString(static_cast(str_size)); size_t charge{0}; - CacheValueChunk* chunks_head = - sec_cache->SplitValueIntoChunks(str, kLZ4Compression, charge); + CacheValueChunk* chunks_head = sec_cache->SplitValueIntoChunks(str, charge); ASSERT_EQ(charge, str_size + 3 * (sizeof(CacheValueChunk) - 1)); CacheValueChunk* current_chunk = chunks_head; @@ -688,12 +761,9 @@ class CompressedSecondaryCacheTestBase : public testing::Test, std::unique_ptr sec_cache = std::make_unique( CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0)); - size_t charge{0}; - CacheAllocationPtr value = - sec_cache->MergeChunksIntoValue(chunks_head, charge); - ASSERT_EQ(charge, size1 + size2 + size3); - std::string value_str{value.get(), charge}; - ASSERT_EQ(strcmp(value_str.data(), str.data()), 0); + std::string value_str = sec_cache->MergeChunksIntoValue(chunks_head); + ASSERT_EQ(value_str.size(), size1 + size2 + size3); + ASSERT_EQ(value_str, str); while (chunks_head != nullptr) { CacheValueChunk* tmp_chunk = chunks_head; @@ -725,15 +795,12 @@ class CompressedSecondaryCacheTestBase : public testing::Test, size_t str_size{8500}; std::string str = rnd.RandomString(static_cast(str_size)); size_t charge{0}; - CacheValueChunk* chunks_head = - sec_cache->SplitValueIntoChunks(str, kLZ4Compression, charge); + CacheValueChunk* chunks_head = sec_cache->SplitValueIntoChunks(str, charge); ASSERT_EQ(charge, str_size + 3 * (sizeof(CacheValueChunk) - 1)); - CacheAllocationPtr value = - sec_cache->MergeChunksIntoValue(chunks_head, charge); - ASSERT_EQ(charge, str_size); - std::string value_str{value.get(), charge}; - ASSERT_EQ(strcmp(value_str.data(), str.data()), 0); + std::string value_str = sec_cache->MergeChunksIntoValue(chunks_head); + ASSERT_EQ(value_str.size(), str_size); + ASSERT_EQ(value_str, str); sec_cache->GetHelper(true)->del_cb(chunks_head, /*alloc*/ nullptr); } @@ -789,8 +856,7 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam, BasicTestFromString) { if (LZ4_Supported()) { sec_cache_uri = "compressed_secondary_cache://" - "capacity=2048;num_shard_bits=0;compression_type=kLZ4Compression;" - "compress_format_version=2"; + "capacity=2048;num_shard_bits=0;compression_type=kLZ4Compression"; } else { ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); sec_cache_uri = @@ -821,7 +887,7 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam, sec_cache_uri = "compressed_secondary_cache://" "capacity=2048;num_shard_bits=0;compression_type=kLZ4Compression;" - "compress_format_version=2;enable_custom_split_merge=true"; + "enable_custom_split_merge=true"; } else { ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); sec_cache_uri = @@ -896,8 +962,8 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam, EntryRoles) { std::shared_ptr sec_cache = NewCompressedSecondaryCache(opts); - // Fixed seed to ensure consistent compressibility (doesn't compress) - std::string junk(Random(301).RandomString(1000)); + Random rnd(301); + std::string junk = test::CompressibleString(&rnd, 0.5, 1000); for (uint32_t i = 0; i < kNumCacheEntryRoles; ++i) { CacheEntryRole role = static_cast(i); @@ -930,9 +996,11 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam, EntryRoles) { sec_cache_is_compressed_ && !do_not_compress.Contains(role); if (compressed) { ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, - 1000); - ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, - 1007); + junk.length()); + ASSERT_LT(get_perf_context()->compressed_sec_cache_compressed_bytes, + junk.length() * 3 / 4); + ASSERT_GT(get_perf_context()->compressed_sec_cache_compressed_bytes, + junk.length() / 4); } else { ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc index 7a1f18ed6f53..c9b4393dd274 100644 --- a/cache/lru_cache_test.cc +++ b/cache/lru_cache_test.cc @@ -1405,9 +1405,9 @@ TEST_P(BasicSecondaryCacheTest, SaveFailTest) { TestItem* item1 = new TestItem(str1.data(), str1.length()); ASSERT_OK(cache->Insert(k1.AsSlice(), item1, GetHelperFail(), str1.length())); std::string str2 = rnd.RandomString(1020); + ASSERT_EQ(secondary_cache->num_inserts(), 0u); TestItem* item2 = new TestItem(str2.data(), str2.length()); // k1 should be demoted to NVM - ASSERT_EQ(secondary_cache->num_inserts(), 0u); ASSERT_OK(cache->Insert(k2.AsSlice(), item2, GetHelperFail(), str2.length())); ASSERT_EQ(secondary_cache->num_inserts(), 1u); @@ -1503,7 +1503,7 @@ TEST_P(BasicSecondaryCacheTest, FullCapacityTest) { /*context*/ this, Cache::Priority::LOW); ASSERT_EQ(handle1, nullptr); - // k1 promotion can fail with strict_capacit_limit=true, but Lookup still + // k1 promotion can fail with strict_capacity_limit=true, but Lookup still // succeeds using a standalone handle handle1 = cache->Lookup(k1.AsSlice(), GetHelper(), /*context*/ this, Cache::Priority::LOW); @@ -1680,7 +1680,7 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) { // After Flush is successful, RocksDB will do the paranoid check for the new // SST file. Meta blocks are always cached in the block cache and they // will not be evicted. When block_2 is cache miss and read out, it is - // inserted to the block cache. Thefore, block_1 is evicted from block + // inserted to the block cache. Therefore, block_1 is evicted from block // cache and successfully inserted to the secondary cache. Here are 2 // lookups in the secondary cache for block_1 and block_2. ASSERT_EQ(secondary_cache->num_inserts(), 1u); @@ -1721,7 +1721,7 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) { v = Get(Key(0)); ASSERT_EQ(1007, v.size()); // This Get needs to access block_1, since block_1 is not in block cache - // there is one econdary cache lookup. Then, block_1 is cached in the + // there is one secondary cache lookup. Then, block_1 is cached in the // block cache. ASSERT_EQ(secondary_cache->num_inserts(), 2u); ASSERT_EQ(secondary_cache->num_lookups(), 5u); @@ -1785,7 +1785,7 @@ TEST_P(DBSecondaryCacheTest, NoSecondaryCacheInsertion) { std::string v = Get(Key(0)); ASSERT_EQ(1000, v.size()); // Since the block cache is large enough, all the blocks are cached. we - // do not need to lookup the seondary cache. + // do not need to lookup the secondary cache. ASSERT_EQ(secondary_cache->num_inserts(), 0u); ASSERT_EQ(secondary_cache->num_lookups(), 2u); @@ -2150,7 +2150,7 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) { ASSERT_OK(Flush()); Compact("a", "z"); - // do th eread for all the key value pairs, so all the blocks should be in + // do the read for all the key value pairs, so all the blocks should be in // cache uint32_t start_insert = cache->GetInsertCount(); uint32_t start_lookup = cache->GetLookupcount(); @@ -2179,7 +2179,7 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) { &cache_dumper); ASSERT_OK(s); std::vector db_list; - db_list.push_back(db_); + db_list.push_back(db_.get()); s = cache_dumper->SetDumpFilter(db_list); ASSERT_OK(s); s = cache_dumper->DumpCacheEntriesToWriter(); @@ -2263,11 +2263,11 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) { options.env = fault_env_.get(); std::string dbname1 = test::PerThreadDBPath("db_1"); ASSERT_OK(DestroyDB(dbname1, options)); - DB* db1 = nullptr; + std::unique_ptr db1; ASSERT_OK(DB::Open(options, dbname1, &db1)); std::string dbname2 = test::PerThreadDBPath("db_2"); ASSERT_OK(DestroyDB(dbname2, options)); - DB* db2 = nullptr; + std::unique_ptr db2; ASSERT_OK(DB::Open(options, dbname2, &db2)); fault_fs_->SetFailGetUniqueId(true); @@ -2335,7 +2335,7 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) { &cache_dumper); ASSERT_OK(s); std::vector db_list; - db_list.push_back(db1); + db_list.push_back(db1.get()); s = cache_dumper->SetDumpFilter(db_list); ASSERT_OK(s); s = cache_dumper->DumpCacheEntriesToWriter(); @@ -2377,7 +2377,7 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) { ASSERT_OK(s); ASSERT_OK(db1->Close()); - delete db1; + db1.reset(); ASSERT_OK(DB::Open(options, dbname1, &db1)); // After load, we do the Get again. To validate the cache, we do not allow any @@ -2406,8 +2406,8 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) { ASSERT_EQ(256, static_cast(block_lookup)); fault_fs_->SetFailGetUniqueId(false); fault_fs_->SetFilesystemActive(true); - delete db1; - delete db2; + db1.reset(); + db2.reset(); ASSERT_OK(DestroyDB(dbname1, options)); ASSERT_OK(DestroyDB(dbname2, options)); } @@ -2464,7 +2464,7 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionBasic) { std::string v = Get(Key(0)); ASSERT_EQ(1007, v.size()); - // Check the data in first block. Cache miss, direclty read from SST file. + // Check the data in first block. Cache miss, directly read from SST file. ASSERT_EQ(secondary_cache->num_inserts(), 0u); ASSERT_EQ(secondary_cache->num_lookups(), 0u); @@ -2598,7 +2598,7 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionChange) { } // Two DB test. We create 2 DBs sharing the same block cache and secondary -// cache. We diable the secondary cache option for DB2. +// cache. We disable the secondary cache option for DB2. TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) { if (IsHyperClock()) { ROCKSDB_GTEST_BYPASS("Test depends on LRUCache-specific behaviors"); @@ -2619,11 +2619,11 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) { options.paranoid_file_checks = true; std::string dbname1 = test::PerThreadDBPath("db_t_1"); ASSERT_OK(DestroyDB(dbname1, options)); - DB* db1 = nullptr; + std::unique_ptr db1; ASSERT_OK(DB::Open(options, dbname1, &db1)); std::string dbname2 = test::PerThreadDBPath("db_t_2"); ASSERT_OK(DestroyDB(dbname2, options)); - DB* db2 = nullptr; + std::unique_ptr db2; Options options2 = options; options2.lowest_used_cache_tier = CacheTier::kVolatileTier; ASSERT_OK(DB::Open(options2, dbname2, &db2)); @@ -2700,8 +2700,8 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) { fault_fs_->SetFailGetUniqueId(false); fault_fs_->SetFilesystemActive(true); - delete db1; - delete db2; + db1.reset(); + db2.reset(); ASSERT_OK(DestroyDB(dbname1, options)); ASSERT_OK(DestroyDB(dbname2, options)); } diff --git a/cache/secondary_cache_adapter.cc b/cache/secondary_cache_adapter.cc index 57a77bc7fcb0..c02e31227308 100644 --- a/cache/secondary_cache_adapter.cc +++ b/cache/secondary_cache_adapter.cc @@ -33,7 +33,7 @@ const char* kTieredCacheName = "TieredCache"; // proportionally across the primary/secondary caches. // // The primary block cache is initially sized to the sum of the primary cache -// budget + teh secondary cache budget, as follows - +// budget + the secondary cache budget, as follows - // |--------- Primary Cache Configured Capacity -----------| // |---Secondary Cache Budget----|----Primary Cache Budget-----| // @@ -51,7 +51,7 @@ const char* kTieredCacheName = "TieredCache"; // placeholder is counted against the primary cache. To compensate and count // a portion of it against the secondary cache, the secondary cache Deflate() // method is called to shrink it. Since the Deflate() causes the secondary -// actual usage to shrink, it is refelcted here by releasing an equal amount +// actual usage to shrink, it is reflected here by releasing an equal amount // from the pri_cache_res_ reservation. The Deflate() in the secondary cache // can be, but is not required to be, implemented using its own cache // reservation manager. @@ -72,7 +72,7 @@ const char* kTieredCacheName = "TieredCache"; // reservation is increased by an equal amount. // // Another way of implementing this would have been to simply split the user -// reservation into primary and seconary components. However, this would +// reservation into primary and secondary components. However, this would // require allocating a structure to track the associated secondary cache // reservation, which adds some complexity and overhead. // @@ -121,7 +121,14 @@ CacheWithSecondaryAdapter::~CacheWithSecondaryAdapter() { assert(s.ok()); assert(placeholder_usage_ == 0); assert(reserved_usage_ == 0); - assert(pri_cache_res_->GetTotalMemoryUsed() == sec_capacity); + if (pri_cache_res_->GetTotalMemoryUsed() != sec_capacity) { + fprintf(stdout, + "~CacheWithSecondaryAdapter: Primary cache reservation: " + "%zu, Secondary cache capacity: %zu, " + "Secondary cache reserved: %zu\n", + pri_cache_res_->GetTotalMemoryUsed(), sec_capacity, + sec_reserved_); + } } #endif // NDEBUG } @@ -479,12 +486,10 @@ const char* CacheWithSecondaryAdapter::Name() const { // as well. At the moment, we don't have a good way of handling the case // where the new capacity < total cache reservations. void CacheWithSecondaryAdapter::SetCapacity(size_t capacity) { - size_t sec_capacity = static_cast( - capacity * (distribute_cache_res_ ? sec_cache_res_ratio_ : 0.0)); - size_t old_sec_capacity = 0; - if (distribute_cache_res_) { MutexLock m(&cache_res_mutex_); + size_t sec_capacity = static_cast(capacity * sec_cache_res_ratio_); + size_t old_sec_capacity = 0; Status s = secondary_cache_->GetCapacity(old_sec_capacity); if (!s.ok()) { @@ -579,7 +584,7 @@ Status CacheWithSecondaryAdapter::UpdateCacheReservationRatio( size_t pri_capacity = target_->GetCapacity(); size_t sec_capacity = static_cast(pri_capacity * compressed_secondary_ratio); - size_t old_sec_capacity; + size_t old_sec_capacity = 0; Status s = secondary_cache_->GetCapacity(old_sec_capacity); if (!s.ok()) { return s; @@ -603,6 +608,7 @@ Status CacheWithSecondaryAdapter::UpdateCacheReservationRatio( // cache utilization (increase in capacity - increase in share of cache // reservation) // 3. Increase secondary cache capacity + assert(new_sec_reserved >= sec_reserved_); s = secondary_cache_->Deflate(new_sec_reserved - sec_reserved_); assert(s.ok()); s = pri_cache_res_->UpdateCacheReservation( @@ -615,7 +621,7 @@ Status CacheWithSecondaryAdapter::UpdateCacheReservationRatio( } else { // We're shrinking the ratio. Try to avoid unnecessary evictions - // 1. Lower the secondary cache capacity - // 2. Decrease pri_cache_res_ reservation to relect lower secondary + // 2. Decrease pri_cache_res_ reservation to reflect lower secondary // cache utilization (decrease in capacity - decrease in share of cache // reservations) // 3. Inflate the secondary cache to give it back the reduction in its diff --git a/ccache_msvc_compiler.bat b/ccache_msvc_compiler.bat new file mode 100644 index 000000000000..9501ec592bc4 --- /dev/null +++ b/ccache_msvc_compiler.bat @@ -0,0 +1 @@ +ccache.exe cl.exe %* diff --git a/claude_md/add_option.md b/claude_md/add_option.md new file mode 100644 index 000000000000..77caa1dbeeeb --- /dev/null +++ b/claude_md/add_option.md @@ -0,0 +1,512 @@ +# Adding New Options to RocksDB Public API + +This document provides guidance on how to add new options to RocksDB's public API. There are two main categories of options: + +1. **Standard Column Family Options** (Options/DBOptions/AdvancedColumnFamilyOptions) +2. **BlockBasedTableOptions** (options specific to block-based table format) + +## Overview of Files to Modify + +### For Standard Column Family Options + +| File | Purpose | +|------|---------| +| `include/rocksdb/advanced_options.h` | Define the option with documentation | +| `include/rocksdb/options.h` | Add reference in related option groups if needed | +| `options/cf_options.h` | Add to `MutableCFOptions` or `ImmutableCFOptions` struct | +| `options/cf_options.cc` | Register option for serialization/deserialization and logging | +| `options/options_helper.cc` | Add to `UpdateColumnFamilyOptions()` for mutable options | +| `options/options_settable_test.cc` | Add to test string for option parsing | +| `db_stress_tool/db_stress_common.h` | Declare gflag | +| `db_stress_tool/db_stress_gflags.cc` | Define gflag with default value | +| `db_stress_tool/db_stress_test_base.cc` | Apply flag to options | +| `tools/db_bench_tool.cc` | Add flag definition and apply to options | +| `tools/db_crashtest.py` | Add randomized values for stress testing | +| `unreleased_history/new_features/` | Add release note markdown file | + +### For BlockBasedTableOptions + +| File | Purpose | +|------|---------| +| `include/rocksdb/table.h` | Define the option in `BlockBasedTableOptions` struct | +| `table/block_based/block_based_table_factory.cc` | Register for serialization, validation, and printing | +| `options/options_settable_test.cc` | Add to `BlockBasedTableOptionsAllFieldsSettable` test | +| `options/options_test.cc` | Add to `MutableCFOptions` test if applicable | +| `db_stress_tool/db_stress_common.h` | Declare gflag | +| `db_stress_tool/db_stress_gflags.cc` | Define gflag | +| `db_stress_tool/db_stress_test_base.cc` | Apply flag to `block_based_options` | +| `tools/db_bench_tool.cc` | Add flag definition and apply to `block_based_options` | +| `tools/db_crashtest.py` | Add randomized values | +| `java/src/main/java/org/rocksdb/BlockBasedTableConfig.java` | Java API | +| `java/rocksjni/portal.h` | JNI portal for Java bindings | +| `java/rocksjni/table.cc` | JNI implementation | +| `java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java` | Java unit test | + +--- + +## Pattern 1: Adding a Standard Column Family Option + +Example reference: commit `94e65a2e0b4f817aa4bfa4c96cdf867e7980d7bc` (memtable_veirfy_per_key_checksum_on_seek) + +### Step 1: Define the Option in Public Header + +**File: `include/rocksdb/advanced_options.h`** + +Add the option with documentation in `AdvancedColumnFamilyOptions` struct: + +```cpp +// Enables additional integrity checks during seek. +// Specifically, for skiplist-based memtables, key checksum validation could +// be enabled during seek optionally. This is helpful to detect corrupted +// memtable keys during reads. Enabling this feature incurs a performance +// overhead due to additional key checksum validation during memtable seek +// operation. +// This option depends on memtable_protection_bytes_per_key to be non zero. +// If memtable_protection_bytes_per_key is zero, no validation is performed. +bool memtable_veirfy_per_key_checksum_on_seek = false; +``` + +### Step 2: Add to Internal Options Structs + +**File: `options/cf_options.h`** + +Add to `MutableCFOptions` struct (or `ImmutableCFOptions` for immutable options): + +```cpp +// In MutableCFOptions constructor from Options: +memtable_veirfy_per_key_checksum_on_seek( + options.memtable_veirfy_per_key_checksum_on_seek), + +// In MutableCFOptions default constructor: +memtable_veirfy_per_key_checksum_on_seek(false), + +// In MutableCFOptions struct member declarations: +bool memtable_veirfy_per_key_checksum_on_seek; +``` + +### Step 3: Register for Serialization/Deserialization + +**File: `options/cf_options.cc`** + +Add to the options type info map for serialization: + +```cpp +{"memtable_veirfy_per_key_checksum_on_seek", + {offsetof(struct MutableCFOptions, + memtable_veirfy_per_key_checksum_on_seek), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, +``` + +Add logging in `MutableCFOptions::Dump()`: + +```cpp +ROCKS_LOG_INFO(log, "memtable_veirfy_per_key_checksum_on_seek: %d", + memtable_veirfy_per_key_checksum_on_seek); +``` + +### Step 4: Update Options Helper + +**File: `options/options_helper.cc`** + +Add to `UpdateColumnFamilyOptions()`: + +```cpp +cf_opts->memtable_veirfy_per_key_checksum_on_seek = + moptions.memtable_veirfy_per_key_checksum_on_seek; +``` + +### Step 5: Add to Options Settable Test + +**File: `options/options_settable_test.cc`** + +Add to the test string in `ColumnFamilyOptionsAllFieldsSettable`: + +```cpp +"memtable_veirfy_per_key_checksum_on_seek=1;" +``` + +### Step 6: Add db_stress Support + +**File: `db_stress_tool/db_stress_common.h`** + +```cpp +DECLARE_bool(memtable_veirfy_per_key_checksum_on_seek); +``` + +**File: `db_stress_tool/db_stress_gflags.cc`** + +```cpp +DEFINE_bool( + memtable_veirfy_per_key_checksum_on_seek, + ROCKSDB_NAMESPACE::Options().memtable_veirfy_per_key_checksum_on_seek, + "Sets CF option memtable_veirfy_per_key_checksum_on_seek."); +``` + +**File: `db_stress_tool/db_stress_test_base.cc`** + +```cpp +options.memtable_veirfy_per_key_checksum_on_seek = + FLAGS_memtable_veirfy_per_key_checksum_on_seek; +``` + +### Step 7: Add db_bench Support + +**File: `tools/db_bench_tool.cc`** + +```cpp +// Flag definition (near related flags): +DEFINE_bool(memtable_veirfy_per_key_checksum_on_seek, false, + "Sets CF option memtable_veirfy_per_key_checksum_on_seek"); + +// Apply flag to options (in InitializeOptionsFromFlags or similar): +options.memtable_veirfy_per_key_checksum_on_seek = + FLAGS_memtable_veirfy_per_key_checksum_on_seek; +``` + +### Step 8: Add Crash Test Support + +**File: `tools/db_crashtest.py`** + +```python +"memtable_veirfy_per_key_checksum_on_seek": lambda: random.choice([0] * 7 + [1]), +``` + +Also add constraint handling in `finalize_and_sanitize()` if needed: + +```python +# only skip list memtable representation supports paranoid memory checks +if dest_params.get("memtablerep") != "skip_list": + dest_params["memtable_veirfy_per_key_checksum_on_seek"] = 0 +``` + +### Step 9: Add Release Note + +**File: `unreleased_history/new_features/.md`** + +```markdown +A new flag memtable_veirfy_per_key_checksum_on_seek is added to AdvancedColumnFamilyOptions. When it is enabled, it will validate key checksum along the binary search path on skiplist based memtable during seek operation. +``` + +--- + +## Pattern 2: Adding a BlockBasedTableOptions Option + +Example reference: commit `742741b175c5f238374c1714f9db3340d49de569` (super_block_alignment_size) + +### Step 1: Define the Option in Public Header + +**File: `include/rocksdb/table.h`** + +Add to `BlockBasedTableOptions` struct with documentation: + +```cpp +// Align data blocks on super block alignment. Avoid a data block split across +// super block boundaries. Works with/without compression. +// +// Here a "super block" refers to an aligned unit of underlying Filesystem +// storage for which there is an extra cost when a random read involves two +// such super blocks instead of just one. Configuring that size here suggests +// inserting padding in the SST file to avoid a single SST block splitting +// across two super blocks. Only power-of-two sizes are supported. See also +// super_block_alignment_space_overhead_ratio. Default to 0, which means super +// block alignment is disabled. +size_t super_block_alignment_size = 0; + +// This option controls the storage space overhead of super block alignment. +// It is used to calculate the max padding size allowed for super block +// alignment. It is calculated in this way. If super_block_alignment_size is +// 2MB, and super_block_alignment_overhead_ratio is 128, then the max padding +// size allowed for super block alignment is 2MB / 128 = 16KB. +// Note that, when it is set to 0, super block alignment is disabled. +size_t super_block_alignment_space_overhead_ratio = 128; +``` + +### Step 2: Register for Serialization in Table Factory + +**File: `table/block_based/block_based_table_factory.cc`** + +Add to the type info map: + +```cpp +{"super_block_alignment_size", + {offsetof(struct BlockBasedTableOptions, super_block_alignment_size), + OptionType::kSizeT, OptionVerificationType::kNormal}}, +{"super_block_alignment_space_overhead_ratio", + {offsetof(struct BlockBasedTableOptions, + super_block_alignment_space_overhead_ratio), + OptionType::kSizeT, OptionVerificationType::kNormal}}, +``` + +Add validation in `ValidateOptions()`: + +```cpp +if ((table_options_.super_block_alignment_size & + (table_options_.super_block_alignment_size - 1))) { + return Status::InvalidArgument( + "Super Block alignment requested but super block alignment size is not " + "a power of 2"); +} +if (table_options_.super_block_alignment_size > + std::numeric_limits::max()) { + return Status::InvalidArgument( + "Super block alignment size exceeds maximum number (4GiB) allowed"); +} +``` + +Add printing in `GetPrintableOptions()`: + +```cpp +snprintf(buffer, kBufferSize, + " super_block_alignment_size: %" ROCKSDB_PRIszt "\n", + table_options_.super_block_alignment_size); +ret.append(buffer); +``` + +### Step 3: Add to Options Settable Test + +**File: `options/options_settable_test.cc`** + +Add to `BlockBasedTableOptionsAllFieldsSettable` test: + +```cpp +"super_block_alignment_size=65536;" +"super_block_alignment_space_overhead_ratio=4096;" +``` + +### Step 4: Add to Options Test + +**File: `options/options_test.cc`** + +```cpp +ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, cf_opts, + "block_based_table_factory.super_block_alignment_size=65536; " + "block_based_table_factory.super_block_alignment_space_overhead_ratio=4096;", + &cf_opts)); +ASSERT_EQ(bbto->super_block_alignment_size, 65536); +ASSERT_EQ(bbto->super_block_alignment_space_overhead_ratio, 4096); +``` + +### Step 5: Add db_stress Support + +**File: `db_stress_tool/db_stress_common.h`** + +```cpp +DECLARE_uint64(super_block_alignment_size); +DECLARE_uint64(super_block_alignment_space_overhead_ratio); +``` + +**File: `db_stress_tool/db_stress_gflags.cc`** + +```cpp +DEFINE_uint64( + super_block_alignment_size, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().super_block_alignment_size, + "BlockBasedTableOptions.super_block_alignment_size"); + +DEFINE_uint64( + super_block_alignment_space_overhead_ratio, + ROCKSDB_NAMESPACE::BlockBasedTableOptions() + .super_block_alignment_space_overhead_ratio, + "BlockBasedTableOptions.super_block_alignment_space_overhead_ratio"); +``` + +**File: `db_stress_tool/db_stress_test_base.cc`** + +```cpp +block_based_options.super_block_alignment_size = + fLU64::FLAGS_super_block_alignment_size; +block_based_options.super_block_alignment_space_overhead_ratio = + fLU64::FLAGS_super_block_alignment_space_overhead_ratio; +``` + +### Step 6: Add db_bench Support + +**File: `tools/db_bench_tool.cc`** + +```cpp +// Flag definitions: +DEFINE_uint64( + super_block_alignment_size, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().super_block_alignment_size, + "Configure super block size"); + +DEFINE_uint64(super_block_alignment_space_overhead_ratio, + ROCKSDB_NAMESPACE::BlockBasedTableOptions() + .super_block_alignment_space_overhead_ratio, + "Configure space overhead for super block alignment"); + +// Apply to block_based_options (in the block where other options are set): +block_based_options.super_block_alignment_size = FLAGS_super_block_alignment_size; +block_based_options.super_block_alignment_space_overhead_ratio = + FLAGS_super_block_alignment_space_overhead_ratio; +``` + +### Step 7: Add Crash Test Support + +**File: `tools/db_crashtest.py`** + +```python +"super_block_alignment_size": lambda: random.choice( + [0, 128 * 1024, 512 * 1024, 2 * 1024 * 1024] +), +"super_block_alignment_space_overhead_ratio": lambda: random.choice([0, 32, 4096]), +``` + +### Step 8: Add Java API Support + +**File: `java/src/main/java/org/rocksdb/BlockBasedTableConfig.java`** + +Add getter and setter methods: + +```java +/** + * Get the super block alignment size. + * + * @return the super block alignment size. + */ +public long superBlockAlignmentSize() { + return superBlockAlignmentSize; +} + +/** + * Set the super block alignment size. + * When set to 0, super block alignment is disabled. + * + * @param superBlockAlignmentSize the super block alignment size. + * + * @return the reference to the current option. + */ +public BlockBasedTableConfig setSuperBlockAlignmentSize(final long superBlockAlignmentSize) { + this.superBlockAlignmentSize = superBlockAlignmentSize; + return this; +} +``` + +Add member variable: + +```java +private long superBlockAlignmentSize; +``` + +Update constructor and native method signature. + +**File: `java/rocksjni/portal.h`** + +Update `GetMethodID` signature and add fields to Java object construction. + +**File: `java/rocksjni/table.cc`** + +Add parameters to JNI function and apply to options. + +**File: `java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java`** + +Add unit tests: + +```java +@Test +public void superBlockAlignmentSize() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setSuperBlockAlignmentSize(1024 * 1024); + assertThat(blockBasedTableConfig.superBlockAlignmentSize()).isEqualTo(1024 * 1024); +} +``` + +--- + +## Pattern 3: Adding C API for Existing Option + +Example reference: commit `429b36c22d76403d275dd0e6877b08d4cea2bc90` (block_align C API) + +If an option already exists but needs C API support: + +**File: `db/c.cc`** + +```cpp +void rocksdb_block_based_options_set_block_align( + rocksdb_block_based_table_options_t* options, unsigned char v) { + options->rep.block_align = v; +} +``` + +**File: `include/rocksdb/c.h`** + +```cpp +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_align( + rocksdb_block_based_table_options_t*, unsigned char); +``` + +--- + +## Unit Testing Guidelines + +### For Standard Options + +Add tests in appropriate test files (e.g., `db/db_memtable_test.cc`, `db/db_options_test.cc`): + +```cpp +TEST_F(DBMemTableTest, YourOptionTest) { + Options options; + options.your_new_option = true; + Reopen(options); + // Test the behavior +} +``` + +### For BlockBasedTableOptions + +Add tests in `db/db_flush_test.cc`, `table/block_based/block_based_table_reader_test.cc`, or `table/table_test.cc`: + +```cpp +TEST_P(DBFlushYourFeatureTest, YourFeature) { + Options options; + BlockBasedTableOptions block_options; + block_options.your_new_option = some_value; + options.table_factory.reset(NewBlockBasedTableFactory(block_options)); + + ASSERT_OK(options.table_factory->ValidateOptions( + DBOptions(options), ColumnFamilyOptions(options))); + + Reopen(options); + // Test the behavior +} +``` + +--- + +## Option Type Reference + +Common option types used in serialization: + +| OptionType | C++ Type | Example | +|------------|----------|---------| +| `kBoolean` | `bool` | `paranoid_memory_checks` | +| `kInt` | `int` | `max_write_buffer_number` | +| `kInt32T` | `int32_t` | `level0_file_num_compaction_trigger` | +| `kUInt32T` | `uint32_t` | `memtable_protection_bytes_per_key` | +| `kUInt64T` | `uint64_t` | `target_file_size_base` | +| `kSizeT` | `size_t` | `block_size` | +| `kDouble` | `double` | `compression_ratio` | +| `kString` | `std::string` | `db_log_dir` | + +--- + +## Checklist Summary + +- [ ] Public header file with option definition and documentation +- [ ] Internal options struct (MutableCFOptions or ImmutableCFOptions) +- [ ] Options serialization/deserialization registration +- [ ] Options logging in Dump() method +- [ ] UpdateColumnFamilyOptions() for mutable options +- [ ] options_settable_test.cc +- [ ] db_stress_common.h (DECLARE) +- [ ] db_stress_gflags.cc (DEFINE) +- [ ] db_stress_test_base.cc (apply flag) +- [ ] db_bench_tool.cc (DEFINE and apply) +- [ ] db_crashtest.py (randomized values) +- [ ] Unit tests +- [ ] unreleased_history markdown file +- [ ] Java API (for BlockBasedTableOptions) +- [ ] C API (if needed) + diff --git a/claude_md/add_public_api.md b/claude_md/add_public_api.md new file mode 100644 index 000000000000..684b89faeba5 --- /dev/null +++ b/claude_md/add_public_api.md @@ -0,0 +1,504 @@ +# RocksDB API Development Guide + +This document provides guidance for adding new public APIs to RocksDB, following the established patterns used by existing APIs like `CompactRange`. + +## API Layer Architecture + +RocksDB exposes public APIs through multiple layers. Users can access RocksDB through any of the three public APIs: C++ headers, C headers, or Java bindings. + +Here is an example for public header db.h: + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Level 1: Public APIs (User Entry Points) │ +├───────────────────────┬─────────────────────────┬───────────────────────────┤ +│ C++ Public API │ C API Bindings │ Java/JNI API │ +│ include/rocksdb/db.h │ include/rocksdb/c.h │ java/src/.../RocksDB.java │ +│ include/rocksdb/*.h │ │ java/src/.../*.java │ +└───────────────────────┴────────────┬────────────┴───────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Level 2: C++ Implementation (Internal Core) │ +│ db/db_impl/db_impl*.cc, db/c.cc, java/rocksjni/*.cc │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +## Step-by-Step Guide: Adding a New Public API + +### Step 1: Define the C++ Public Interface + +**File:** `include/rocksdb/db.h` + +Add the virtual method declaration in the `DB` class: + +\`\`\`cpp +// Pure virtual - must be implemented by DBImpl +virtual Status YourNewAPI(const YourAPIOptions& options, + ColumnFamilyHandle* column_family, + /* other params */) = 0; + +// Convenience overload for default column family +virtual Status YourNewAPI(const YourAPIOptions& options, + /* other params */) { + return YourNewAPI(options, DefaultColumnFamily(), /* other params */); +} +\`\`\` + +**Key Patterns:** +- Use `Status` return type for error handling +- Use `OptSlice` to avoid unnecessary levels of indirection and use of raw pointers. +- Use `ColumnFamilyHandle*` for column family support +- Provide convenience overloads for the default column family + +### Step 2: Define Options Struct (If Needed) + +**File:** `include/rocksdb/options.h` + +If your API has multiple configuration options, define an options struct: + +\`\`\`cpp +struct YourAPIOptions { + // Document each option with clear comments + bool some_boolean_option = false; + + // Default value explanation + int some_int_option = -1; + + // Pointer options require careful lifetime management + std::atomic* canceled = nullptr; + + // Enum options for multi-choice settings + YourEnumType some_enum = YourEnumType::kDefault; +}; +\`\`\` + +**Key Patterns:** +- Use sensible default values specified inline (e.g., `= false`, `= -1`) +- Do NOT redundantly document the default value in comments; instead, document the rationale (why this default), historical context, and how different values are interpreted +- Group related options logically +- Consider thread-safety for pointer options + +### Step 3: Implement in DBImpl + +**Header:** `db/db_impl/db_impl.h` + +\`\`\`cpp +using DB::YourNewAPI; +Status YourNewAPI(const YourAPIOptions& options, + ColumnFamilyHandle* column_family, + /* other params */) override; + +// Private internal implementation if needed +Status YourNewAPIInternal(const YourAPIOptions& options, + ColumnFamilyHandle* column_family, + /* other params */); +\`\`\` + +**Implementation:** `db/db_impl/db_impl_.cc` + +Choose the appropriate implementation file based on functionality: +- `db_impl_compaction_flush.cc` - Compaction and flush operations +- `db_impl_write.cc` - Write operations +- `db_impl_open.cc` - DB opening/closing +- `db_impl_files.cc` - File operations +- `db_impl.cc` - General operations + +\`\`\`cpp +Status DBImpl::YourNewAPI(const YourAPIOptions& options, + ColumnFamilyHandle* column_family, + /* other params */) { + // 1. Input validation + if (/* invalid input */) { + return Status::InvalidArgument("Error message"); + } + + // 2. Check for cancellation/abort conditions + if (options.canceled && options.canceled->load(std::memory_order_acquire)) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + + // 3. Get column family data + auto cfh = static_cast(column_family); + auto cfd = cfh->cfd(); + + // 4. Core implementation logic + // ... + + return Status::OK(); +} +\`\`\` + +### Step 4: Handle Special DB Types + +**StackableDB (Wrapper DBs):** +**File:** `include/rocksdb/utilities/stackable_db.h` + +\`\`\`cpp +using DB::YourNewAPI; +Status YourNewAPI(const YourAPIOptions& options, + ColumnFamilyHandle* column_family, + /* other params */) override { + return db_->YourNewAPI(options, column_family, /* other params */); +} +\`\`\` + +**Secondary DB (Read-Only):** +**File:** `db/db_impl/db_impl_secondary.h` + +\`\`\`cpp +using DBImpl::YourNewAPI; +Status YourNewAPI(const YourAPIOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + /* other params */) override { + return Status::NotSupported("Not supported in secondary DB"); +} +\`\`\` + +**CompactedDB (Read-Only):** +**File:** `db/db_impl/compacted_db_impl.h` + +\`\`\`cpp +using DBImpl::YourNewAPI; +Status YourNewAPI(const YourAPIOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + /* other params */) override { + return Status::NotSupported("Not supported for read-only DB"); +} +\`\`\` + +### Step 5: Add C API Bindings + +**Header:** `include/rocksdb/c.h` + +\`\`\`c +// Basic version +extern ROCKSDB_LIBRARY_API void rocksdb_your_new_api( + rocksdb_t* db, + const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len); + +// Column family version +extern ROCKSDB_LIBRARY_API void rocksdb_your_new_api_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len); + +// With options and error handling +extern ROCKSDB_LIBRARY_API void rocksdb_your_new_api_opt( + rocksdb_t* db, rocksdb_your_api_options_t* opt, + const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len, + char** errptr); +\`\`\` + +**Implementation:** `db/c.cc` + +\`\`\`cpp +void rocksdb_your_new_api(rocksdb_t* db, const char* start_key, + size_t start_key_len, const char* limit_key, + size_t limit_key_len) { + Slice a, b; + db->rep->YourNewAPI( + YourAPIOptions(), // Default options + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)); +} + +void rocksdb_your_new_api_cf(rocksdb_t* db, + rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len) { + Slice a, b; + db->rep->YourNewAPI( + YourAPIOptions(), + column_family->rep, + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)); +} +\`\`\` + +**If you have options, also add:** + +\`\`\`cpp +// Options struct wrapper +struct rocksdb_your_api_options_t { + YourAPIOptions rep; +}; + +rocksdb_your_api_options_t* rocksdb_your_api_options_create() { + return new rocksdb_your_api_options_t; +} + +void rocksdb_your_api_options_destroy(rocksdb_your_api_options_t* opt) { + delete opt; +} + +void rocksdb_your_api_options_set_some_option( + rocksdb_your_api_options_t* opt, unsigned char value) { + opt->rep.some_boolean_option = value; +} +\`\`\` + +### Step 6: Add Java Bindings + +**Java API:** `java/src/main/java/org/rocksdb/RocksDB.java` + +\`\`\`java +// Basic version +public void yourNewAPI() throws RocksDBException { + yourNewAPI(null); +} + +// Column family version +public void yourNewAPI(ColumnFamilyHandle columnFamilyHandle) + throws RocksDBException { + yourNewAPI(nativeHandle_, null, -1, null, -1, 0, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); +} + +// Range version +public void yourNewAPI(final byte[] begin, final byte[] end) + throws RocksDBException { + yourNewAPI(null, begin, end); +} + +// Full-featured version with options +public void yourNewAPI(ColumnFamilyHandle columnFamilyHandle, + final byte[] begin, final byte[] end, + final YourAPIOptions options) + throws RocksDBException { + yourNewAPI(nativeHandle_, + begin, begin == null ? -1 : begin.length, + end, end == null ? -1 : end.length, + options.nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); +} + +// Native method declaration +private static native void yourNewAPI(final long handle, + /* @Nullable */ final byte[] begin, final int beginLen, + /* @Nullable */ final byte[] end, final int endLen, + final long optionsHandle, + final long cfHandle); +\`\`\` + +**Options Class:** `java/src/main/java/org/rocksdb/YourAPIOptions.java` + +\`\`\`java +public class YourAPIOptions extends RocksObject { + + public YourAPIOptions() { + super(newYourAPIOptions()); + } + + // Builder pattern setters + public YourAPIOptions setSomeBooleanOption(boolean value) { + setSomeBooleanOption(nativeHandle_, value); + return this; + } + + // Getters + public boolean someBooleanOption() { + return someBooleanOption(nativeHandle_); + } + + // Native method declarations + private static native long newYourAPIOptions(); + private static native void disposeInternalJni(long handle); + private static native void setSomeBooleanOption(long handle, boolean value); + private static native boolean someBooleanOption(long handle); + + @Override + protected final void disposeInternal(final long handle) { + disposeInternalJni(handle); + } +} +\`\`\` + +**JNI Implementation:** `java/rocksjni/rocksjni.cc` + +\`\`\`cpp +void Java_org_rocksdb_RocksDB_yourNewAPI( + JNIEnv* env, jclass, + jlong jdb_handle, jbyteArray jbegin, jint jbegin_len, + jbyteArray jend, jint jend_len, + jlong joptions_handle, jlong jcf_handle) { + + // 1. Convert Java byte arrays to C++ strings + jboolean has_exception = JNI_FALSE; + std::string str_begin; + if (jbegin_len > 0) { + str_begin = ROCKSDB_NAMESPACE::JniUtil::byteString( + env, jbegin, jbegin_len, + [](const char* str, const size_t len) { return std::string(str, len); }, + &has_exception); + if (has_exception == JNI_TRUE) return; + } + + std::string str_end; + if (jend_len > 0) { + str_end = ROCKSDB_NAMESPACE::JniUtil::byteString( + env, jend, jend_len, + [](const char* str, const size_t len) { return std::string(str, len); }, + &has_exception); + if (has_exception == JNI_TRUE) return; + } + + // 2. Get or create options + ROCKSDB_NAMESPACE::YourAPIOptions* options = nullptr; + if (joptions_handle == 0) { + options = new ROCKSDB_NAMESPACE::YourAPIOptions(); + } else { + options = reinterpret_cast(joptions_handle); + } + + // 3. Unwrap handles + auto* db = reinterpret_cast(jdb_handle); + ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle = + jcf_handle == 0 ? db->DefaultColumnFamily() + : reinterpret_cast(jcf_handle); + + // 4. Create Slices + std::unique_ptr begin; + std::unique_ptr end; + if (jbegin_len > 0) begin.reset(new ROCKSDB_NAMESPACE::Slice(str_begin)); + if (jend_len > 0) end.reset(new ROCKSDB_NAMESPACE::Slice(str_end)); + + // 5. Call C++ API + ROCKSDB_NAMESPACE::Status s = db->YourNewAPI(*options, cf_handle, begin.get(), end.get()); + + // 6. Cleanup if we created options + if (joptions_handle == 0) delete options; + + // 7. Throw Java exception on error + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); +} +\`\`\` + +**Options JNI:** `java/rocksjni/your_api_options.cc` + +\`\`\`cpp +jlong Java_org_rocksdb_YourAPIOptions_newYourAPIOptions(JNIEnv*, jclass) { + auto* options = new ROCKSDB_NAMESPACE::YourAPIOptions(); + return GET_CPLUSPLUS_POINTER(options); +} + +void Java_org_rocksdb_YourAPIOptions_disposeInternalJni(JNIEnv*, jclass, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + delete options; +} + +void Java_org_rocksdb_YourAPIOptions_setSomeBooleanOption( + JNIEnv*, jclass, jlong jhandle, jboolean value) { + auto* options = reinterpret_cast(jhandle); + options->some_boolean_option = static_cast(value); +} + +jboolean Java_org_rocksdb_YourAPIOptions_someBooleanOption(JNIEnv*, jclass, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->some_boolean_option); +} +\`\`\` + +### Step 7: Update Build Files + +**Java CMakeLists.txt:** `java/CMakeLists.txt` + +Add your new Java source files: +\`\`\`cmake +src/main/java/org/rocksdb/YourAPIOptions.java +src/test/java/org/rocksdb/YourAPIOptionsTest.java +\`\`\` + +### Step 8: Add Release Notes + +**Directory:** `unreleased_history/` + +RocksDB uses individual files in the `unreleased_history/` directory rather than directly editing `HISTORY.md`. This avoids merge conflicts and ensures changes are attributed to the correct release version. + +Add a file to the appropriate subdirectory: +- `unreleased_history/new_features/` - For new functionality +- `unreleased_history/public_api_changes/` - For API changes +- `unreleased_history/behavior_changes/` - For behavior modifications +- `unreleased_history/bug_fixes/` - For bug fixes + +**Example:** `unreleased_history/new_features/your_new_api.md` + +\`\`\`markdown +Added `YourNewAPI()` to support [describe functionality]. See `YourAPIOptions` for configuration. +\`\`\` + +**Example:** `unreleased_history/public_api_changes/your_api_options.md` + +**Note:** Files should contain one line of markdown. The "* " prefix is automatically added if not included. These files are compiled into `HISTORY.md` during the release process. + +### Step 9: Add Tests + +**C++ Unit Tests:** `db/db_your_api_test.cc` or add to existing test file + +\`\`\`cpp +TEST_F(DBTest, YourNewAPIBasic) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + + // Setup test data + ASSERT_OK(Put(1, "key1", "value1")); + ASSERT_OK(Put(1, "key2", "value2")); + + // Test your API + YourAPIOptions api_options; + api_options.some_boolean_option = true; + ASSERT_OK(db_->YourNewAPI(api_options, handles_[1], nullptr, nullptr)); + + // Verify results + // ... +} +\`\`\` + +**Java Tests:** `java/src/test/java/org/rocksdb/YourAPIOptionsTest.java` + +\`\`\`java +public class YourAPIOptionsTest { + @Test + public void yourAPIOptions() { + try (final YourAPIOptions options = new YourAPIOptions()) { + assertFalse(options.someBooleanOption()); + options.setSomeBooleanOption(true); + assertTrue(options.someBooleanOption()); + } + } +} +\`\`\` + +## File Summary Checklist + + +| Component | File(s) | Required | +|-----------|---------|----------| +| C++ Public Interface | `include/rocksdb/db.h` | ✓ | +| Options Struct | `include/rocksdb/options.h` | If needed | +| DBImpl Declaration | `db/db_impl/db_impl.h` | ✓ | +| DBImpl Implementation | `db/db_impl/db_impl_*.cc` | ✓ | +| StackableDB | `include/rocksdb/utilities/stackable_db.h` | ✓ | +| Secondary DB | `db/db_impl/db_impl_secondary.h` | If not supported | +| Compacted DB | `db/db_impl/compacted_db_impl.h` | If not supported | +| C API Header | `include/rocksdb/c.h` | ✓ | +| C API Implementation | `db/c.cc` | ✓ | +| Java API | `java/src/main/java/org/rocksdb/RocksDB.java` | ✓ | +| Java Options | `java/src/main/java/org/rocksdb/YourAPIOptions.java` | If needed | +| JNI Implementation | `java/rocksjni/rocksjni.cc` | ✓ | +| JNI Options | `java/rocksjni/your_api_options.cc` | If needed | +| Java CMake | `java/CMakeLists.txt` | If new files | +| Changelog | `unreleased_history/*.md` | ✓ | +| C++ Tests | `db/db_*_test.cc` | ✓ | +| Java Tests | `java/src/test/java/org/rocksdb/*Test.java` | ✓ | + +## Best Practices + +1. **Error Handling**: Always return `Status` objects in C++, throw exceptions in Java +2. **Default Values**: Provide sensible defaults for all options +3. **Documentation**: Add clear comments for all public methods and options +4. **Column Family Support**: Always support column family operations +5. **Thread Safety**: Document thread-safety guarantees +6. **Backward Compatibility**: Avoid breaking existing API contracts +7. **Testing**: Add comprehensive unit tests for all code paths diff --git a/crash_test.mk b/crash_test.mk index a71a55c15c73..02e15a862aae 100644 --- a/crash_test.mk +++ b/crash_test.mk @@ -8,21 +8,33 @@ DB_STRESS_CMD?=./db_stress include common.mk CRASHTEST_MAKE=$(MAKE) -f crash_test.mk -CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --cleanup_cmd='$(DB_CLEANUP_CMD)' +CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --cleanup_cmd='$(DB_CLEANUP_CMD)' --destroy_db_initially=1 .PHONY: crash_test crash_test_with_atomic_flush crash_test_with_txn \ + crash_test_with_wc_txn crash_test_with_wp_txn crash_test_with_wup_txn \ crash_test_with_best_efforts_recovery crash_test_with_ts \ + crash_test_with_multiops_wc_txn \ + crash_test_with_multiops_wp_txn \ + crash_test_with_multiops_wup_txn \ + crash_test_with_optimistic_txn \ + crash_test_with_tiered_storage \ blackbox_crash_test blackbox_crash_test_with_atomic_flush \ + blackbox_crash_test_with_wc_txn blackbox_crash_test_with_wp_txn \ + blackbox_crash_test_with_wup_txn \ blackbox_crash_test_with_txn blackbox_crash_test_with_ts \ blackbox_crash_test_with_best_efforts_recovery \ - whitebox_crash_test whitebox_crash_test_with_atomic_flush \ - whitebox_crash_test_with_txn whitebox_crash_test_with_ts \ blackbox_crash_test_with_multiops_wc_txn \ blackbox_crash_test_with_multiops_wp_txn \ - crash_test_with_tiered_storage blackbox_crash_test_with_tiered_storage \ - whitebox_crash_test_with_tiered_storage \ - whitebox_crash_test_with_optimistic_txn \ + blackbox_crash_test_with_multiops_wup_txn \ blackbox_crash_test_with_optimistic_txn \ + blackbox_crash_test_with_tiered_storage \ + whitebox_crash_test whitebox_crash_test_with_atomic_flush \ + whitebox_crash_test_with_wc_txn whitebox_crash_test_with_wp_txn \ + whitebox_crash_test_with_wup_txn \ + whitebox_crash_test_with_txn whitebox_crash_test_with_ts \ + whitebox_crash_test_with_optimistic_txn \ + whitebox_crash_test_with_tiered_storage \ + crash_test_db_cleanup \ crash_test: $(DB_STRESS_CMD) # Do not parallelize @@ -34,10 +46,20 @@ crash_test_with_atomic_flush: $(DB_STRESS_CMD) $(CRASHTEST_MAKE) whitebox_crash_test_with_atomic_flush $(CRASHTEST_MAKE) blackbox_crash_test_with_atomic_flush -crash_test_with_txn: $(DB_STRESS_CMD) +crash_test_with_wc_txn: $(DB_STRESS_CMD) # Do not parallelize - $(CRASHTEST_MAKE) whitebox_crash_test_with_txn - $(CRASHTEST_MAKE) blackbox_crash_test_with_txn + $(CRASHTEST_MAKE) whitebox_crash_test_with_wc_txn + $(CRASHTEST_MAKE) blackbox_crash_test_with_wc_txn + +crash_test_with_wp_txn: $(DB_STRESS_CMD) +# Do not parallelize + $(CRASHTEST_MAKE) whitebox_crash_test_with_wp_txn + $(CRASHTEST_MAKE) blackbox_crash_test_with_wp_txn + +crash_test_with_wup_txn: $(DB_STRESS_CMD) +# Do not parallelize + $(CRASHTEST_MAKE) whitebox_crash_test_with_wup_txn + $(CRASHTEST_MAKE) blackbox_crash_test_with_wup_txn crash_test_with_optimistic_txn: $(DB_STRESS_CMD) # Do not parallelize @@ -62,6 +84,9 @@ crash_test_with_multiops_wc_txn: $(DB_STRESS_CMD) crash_test_with_multiops_wp_txn: $(DB_STRESS_CMD) $(CRASHTEST_MAKE) blackbox_crash_test_with_multiops_wp_txn +crash_test_with_multiops_wup_txn: $(DB_STRESS_CMD) + $(CRASHTEST_MAKE) blackbox_crash_test_with_multiops_wup_txn + blackbox_crash_test: $(DB_STRESS_CMD) $(CRASHTEST_PY) --simple blackbox $(CRASH_TEST_EXT_ARGS) $(CRASHTEST_PY) blackbox $(CRASH_TEST_EXT_ARGS) @@ -69,8 +94,14 @@ blackbox_crash_test: $(DB_STRESS_CMD) blackbox_crash_test_with_atomic_flush: $(DB_STRESS_CMD) $(CRASHTEST_PY) --cf_consistency blackbox $(CRASH_TEST_EXT_ARGS) -blackbox_crash_test_with_txn: $(DB_STRESS_CMD) - $(CRASHTEST_PY) --txn blackbox $(CRASH_TEST_EXT_ARGS) +blackbox_crash_test_with_wc_txn: $(DB_STRESS_CMD) + $(CRASHTEST_PY) --txn blackbox --txn_write_policy 0 $(CRASH_TEST_EXT_ARGS) + +blackbox_crash_test_with_wp_txn: $(DB_STRESS_CMD) + $(CRASHTEST_PY) --txn blackbox --txn_write_policy 1 $(CRASH_TEST_EXT_ARGS) + +blackbox_crash_test_with_wup_txn: $(DB_STRESS_CMD) + $(CRASHTEST_PY) --txn blackbox --txn_write_policy 2 $(CRASH_TEST_EXT_ARGS) blackbox_crash_test_with_best_efforts_recovery: $(DB_STRESS_CMD) $(CRASHTEST_PY) --test_best_efforts_recovery blackbox $(CRASH_TEST_EXT_ARGS) @@ -79,10 +110,13 @@ blackbox_crash_test_with_ts: $(DB_STRESS_CMD) $(CRASHTEST_PY) --enable_ts blackbox $(CRASH_TEST_EXT_ARGS) blackbox_crash_test_with_multiops_wc_txn: $(DB_STRESS_CMD) - $(CRASHTEST_PY) --test_multiops_txn --write_policy write_committed blackbox $(CRASH_TEST_EXT_ARGS) + $(CRASHTEST_PY) --test_multiops_txn --txn_write_policy 0 blackbox $(CRASH_TEST_EXT_ARGS) blackbox_crash_test_with_multiops_wp_txn: $(DB_STRESS_CMD) - $(CRASHTEST_PY) --test_multiops_txn --write_policy write_prepared blackbox $(CRASH_TEST_EXT_ARGS) + $(CRASHTEST_PY) --test_multiops_txn --txn_write_policy 1 blackbox $(CRASH_TEST_EXT_ARGS) + +blackbox_crash_test_with_multiops_wup_txn: $(DB_STRESS_CMD) + $(CRASHTEST_PY) --test_multiops_txn --txn_write_policy 2 blackbox $(CRASH_TEST_EXT_ARGS) blackbox_crash_test_with_tiered_storage: $(DB_STRESS_CMD) $(CRASHTEST_PY) --test_tiered_storage blackbox $(CRASH_TEST_EXT_ARGS) @@ -104,9 +138,17 @@ whitebox_crash_test_with_atomic_flush: $(DB_STRESS_CMD) $(CRASHTEST_PY) --cf_consistency whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) -whitebox_crash_test_with_txn: $(DB_STRESS_CMD) - $(CRASHTEST_PY) --txn whitebox --random_kill_odd \ - $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) +whitebox_crash_test_with_wc_txn: $(DB_STRESS_CMD) + $(CRASHTEST_PY) --txn whitebox --txn_write_policy 0 \ + --random_kill_odd $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) + +whitebox_crash_test_with_wp_txn: $(DB_STRESS_CMD) + $(CRASHTEST_PY) --txn whitebox --txn_write_policy 1 \ + --random_kill_odd $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) + +whitebox_crash_test_with_wup_txn: $(DB_STRESS_CMD) + $(CRASHTEST_PY) --txn whitebox --txn_write_policy 2 \ + --random_kill_odd $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) whitebox_crash_test_with_ts: $(DB_STRESS_CMD) $(CRASHTEST_PY) --enable_ts whitebox --random_kill_odd \ @@ -119,3 +161,11 @@ whitebox_crash_test_with_tiered_storage: $(DB_STRESS_CMD) whitebox_crash_test_with_optimistic_txn: $(DB_STRESS_CMD) $(CRASHTEST_PY) --optimistic_txn whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) + +crash_test_db_cleanup: $(DB_STRESS_CMD) + $(DB_STRESS_CMD) --delete_dir_and_exit=$(TEST_TMPDIR) + +# Old names DEPRECATED +crash_test_with_txn: crash_test_with_wc_txn +whitebox_crash_test_with_txn: whitebox_crash_test_with_wc_txn +blackbox_crash_test_with_txn: blackbox_crash_test_with_wc_txn diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc index 21fb15504061..96441d5d303e 100644 --- a/db/arena_wrapped_db_iter.cc +++ b/db/arena_wrapped_db_iter.cc @@ -42,9 +42,9 @@ Status ArenaWrappedDBIter::GetProperty(std::string prop_name, void ArenaWrappedDBIter::Init( Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Version* version, - const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration, - uint64_t version_number, ReadCallback* read_callback, - ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) { + const SequenceNumber& sequence, uint64_t version_number, + ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh, + bool expose_blob_index, bool allow_refresh, ReadOnlyMemTable* active_mem) { read_options_ = read_options; if (!CheckFSFeatureSupport(env->GetFileSystem().get(), FSSupportedOps::kAsyncIO)) { @@ -52,15 +52,14 @@ void ArenaWrappedDBIter::Init( } read_options_.total_order_seek |= ioptions.prefix_seek_opt_in_only; - auto mem = arena_.AllocateAligned(sizeof(DBIter)); - db_iter_ = new (mem) DBIter(env, read_options_, ioptions, mutable_cf_options, - ioptions.user_comparator, - /* iter */ nullptr, version, sequence, true, - max_sequential_skip_in_iteration, read_callback, - cfh, expose_blob_index); + db_iter_ = DBIter::NewIter( + env, read_options_, ioptions, mutable_cf_options, + ioptions.user_comparator, /*internal_iter=*/nullptr, version, sequence, + read_callback, active_mem, cfh, expose_blob_index, &arena_); sv_number_ = version_number; allow_refresh_ = allow_refresh; + allow_mark_memtable_for_flush_ = active_mem; memtable_range_tombstone_iter_ = nullptr; } @@ -166,9 +165,8 @@ void ArenaWrappedDBIter::DoRefresh(const Snapshot* snapshot, read_callback_->Refresh(read_seq); } Init(env, read_options_, cfd->ioptions(), sv->mutable_cf_options, sv->current, - read_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number, read_callback_, cfh_, expose_blob_index_, - allow_refresh_); + read_seq, sv->version_number, read_callback_, cfh_, expose_blob_index_, + allow_refresh_, allow_mark_memtable_for_flush_ ? sv->mem : nullptr); InternalIterator* internal_iter = db_impl->NewInternalIterator( read_options_, cfd, sv, &arena_, read_seq, @@ -253,20 +251,26 @@ Status ArenaWrappedDBIter::Refresh(const Snapshot* snapshot) { } ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, - const MutableCFOptions& mutable_cf_options, const Version* version, - const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - uint64_t version_number, ReadCallback* read_callback, - ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) { - ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); - iter->Init(env, read_options, ioptions, mutable_cf_options, version, sequence, - max_sequential_skip_in_iterations, version_number, read_callback, - cfh, expose_blob_index, allow_refresh); + Env* env, const ReadOptions& read_options, ColumnFamilyHandleImpl* cfh, + SuperVersion* sv, const SequenceNumber& sequence, + ReadCallback* read_callback, DBImpl* db_impl, bool expose_blob_index, + bool allow_refresh, bool allow_mark_memtable_for_flush) { + ArenaWrappedDBIter* db_iter = new ArenaWrappedDBIter(); + db_iter->Init(env, read_options, cfh->cfd()->ioptions(), + sv->mutable_cf_options, sv->current, sequence, + sv->version_number, read_callback, cfh, expose_blob_index, + allow_refresh, + allow_mark_memtable_for_flush ? sv->mem : nullptr); if (cfh != nullptr && allow_refresh) { - iter->StoreRefreshInfo(cfh, read_callback, expose_blob_index); + db_iter->StoreRefreshInfo(cfh, read_callback, expose_blob_index); } - return iter; + InternalIterator* internal_iter = db_impl->NewInternalIterator( + db_iter->GetReadOptions(), cfh->cfd(), sv, db_iter->GetArena(), sequence, + /*allow_unprepared_value=*/true, db_iter); + db_iter->SetIterUnderDBIter(internal_iter); + + return db_iter; } } // namespace ROCKSDB_NAMESPACE diff --git a/db/arena_wrapped_db_iter.h b/db/arena_wrapped_db_iter.h index 801988bfca7b..26062497a0b7 100644 --- a/db/arena_wrapped_db_iter.h +++ b/db/arena_wrapped_db_iter.h @@ -19,7 +19,6 @@ #include "options/cf_options.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" -#include "util/autovector.h" namespace ROCKSDB_NAMESPACE { @@ -99,13 +98,19 @@ class ArenaWrappedDBIter : public Iterator { bool PrepareValue() override { return db_iter_->PrepareValue(); } + void Prepare(const MultiScanArgs& scan_opts) override { + db_iter_->Prepare(scan_opts); + } + + // FIXME: we could just pass SV in for mutable cf option, version and version + // number, but this is used by SstFileReader which does not have a SV. void Init(Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Version* version, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number, + const SequenceNumber& sequence, uint64_t version_number, ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh, - bool expose_blob_index, bool allow_refresh); + bool expose_blob_index, bool allow_refresh, + ReadOnlyMemTable* active_mem); // Store some parameters so we can refresh the iterator at a later point // with these same params @@ -128,20 +133,16 @@ class ArenaWrappedDBIter : public Iterator { ReadCallback* read_callback_; bool expose_blob_index_ = false; bool allow_refresh_ = true; + bool allow_mark_memtable_for_flush_ = true; // If this is nullptr, it means the mutable memtable does not contain range // tombstone when added under this DBIter. std::unique_ptr* memtable_range_tombstone_iter_ = nullptr; }; -// Generate the arena wrapped iterator class. -// `cfh` is used for reneweal. If left null, renewal will not -// be supported. ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, - const MutableCFOptions& mutable_cf_options, const Version* version, - const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - uint64_t version_number, ReadCallback* read_callback, - ColumnFamilyHandleImpl* cfh = nullptr, bool expose_blob_index = false, - bool allow_refresh = true); + Env* env, const ReadOptions& read_options, ColumnFamilyHandleImpl* cfh, + SuperVersion* sv, const SequenceNumber& sequence, + ReadCallback* read_callback, DBImpl* db_impl, bool expose_blob_index, + bool allow_refresh, bool allow_mark_memtable_for_flush); } // namespace ROCKSDB_NAMESPACE diff --git a/db/blob/blob_file_builder.cc b/db/blob/blob_file_builder.cc index dceb90cee57a..5e71c8a38236 100644 --- a/db/blob/blob_file_builder.cc +++ b/db/blob/blob_file_builder.cc @@ -67,6 +67,16 @@ BlobFileBuilder::BlobFileBuilder( min_blob_size_(mutable_cf_options->min_blob_size), blob_file_size_(mutable_cf_options->blob_file_size), blob_compression_type_(mutable_cf_options->blob_compression_type), + // TODO: support most CompressionOptions with a new CF option + // blob_compression_opts + // TODO with schema change: support custom compression manager and options + // such as max_compressed_bytes_per_kb + // NOTE: returns nullptr for kNoCompression + blob_compressor_(GetBuiltinV2CompressionManager()->GetCompressor( + CompressionOptions{}, blob_compression_type_)), + blob_compressor_wa_(blob_compressor_ + ? blob_compressor_->ObtainWorkingArea() + : Compressor::ManagedWorkingArea{}), prepopulate_blob_cache_(mutable_cf_options->prepopulate_blob_cache), file_options_(file_options), write_options_(write_options), @@ -113,7 +123,7 @@ Status BlobFileBuilder::Add(const Slice& key, const Slice& value, } Slice blob = value; - std::string compressed_blob; + GrowableBuffer compressed_blob; { const Status s = CompressBlobIfNeeded(&blob, &compressed_blob); @@ -188,10 +198,12 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() { } std::unique_ptr file; - + FileOptions fo_copy; { assert(file_options_); - Status s = NewWritableFile(fs_, blob_file_path, &file, *file_options_); + fo_copy = *file_options_; + fo_copy.write_hint = write_hint_; + Status s = NewWritableFile(fs_, blob_file_path, &file, fo_copy); TEST_SYNC_POINT_CALLBACK( "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", &s); @@ -209,7 +221,9 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() { assert(file); file->SetIOPriority(write_options_->rate_limiter_priority); - file->SetWriteLifeTimeHint(write_hint_); + // Subsequent attempts to override the hint via SetWriteLifeTimeHint + // with the very same value will be ignored by the fs. + file->SetWriteLifeTimeHint(fo_copy.write_hint); FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types; Statistics* const statistics = immutable_options_->stats; std::unique_ptr file_writer(new WritableFileWriter( @@ -250,37 +264,27 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() { } Status BlobFileBuilder::CompressBlobIfNeeded( - Slice* blob, std::string* compressed_blob) const { + Slice* blob, GrowableBuffer* compressed_blob) const { assert(blob); assert(compressed_blob); assert(compressed_blob->empty()); assert(immutable_options_); - if (blob_compression_type_ == kNoCompression) { + if (!blob_compressor_) { + assert(blob_compression_type_ == kNoCompression); return Status::OK(); } + assert(blob_compression_type_ != kNoCompression); - // TODO: allow user CompressionOptions, including max_compressed_bytes_per_kb - CompressionOptions opts; - CompressionContext context(blob_compression_type_, opts); - constexpr uint64_t sample_for_compression = 0; - - CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), - blob_compression_type_, sample_for_compression); - - constexpr uint32_t compression_format_version = 2; + // WART: always stored as compressed even when that increases the size. - bool success = false; - - { - StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats, - BLOB_DB_COMPRESSION_MICROS); - success = - CompressData(*blob, info, compression_format_version, compressed_blob); - } - - if (!success) { - return Status::Corruption("Error compressing blob"); + Status s; + StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats, + BLOB_DB_COMPRESSION_MICROS); + s = LegacyForceBuiltinCompression(*blob_compressor_, &blob_compressor_wa_, + *blob, compressed_blob); + if (!s.ok()) { + return s; } *blob = Slice(*compressed_blob); diff --git a/db/blob/blob_file_builder.h b/db/blob/blob_file_builder.h index 6ba7181aa09f..95d55f6bd9b6 100644 --- a/db/blob/blob_file_builder.h +++ b/db/blob/blob_file_builder.h @@ -10,12 +10,14 @@ #include #include +#include "rocksdb/advanced_compression.h" #include "rocksdb/advanced_options.h" #include "rocksdb/compression_type.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/rocksdb_namespace.h" #include "rocksdb/types.h" +#include "util/aligned_buffer.h" namespace ROCKSDB_NAMESPACE { @@ -76,7 +78,8 @@ class BlobFileBuilder { private: bool IsBlobFileOpen() const; Status OpenBlobFileIfNeeded(); - Status CompressBlobIfNeeded(Slice* blob, std::string* compressed_blob) const; + Status CompressBlobIfNeeded(Slice* blob, + GrowableBuffer* compressed_blob) const; Status WriteBlobToFile(const Slice& key, const Slice& blob, uint64_t* blob_file_number, uint64_t* blob_offset); Status CloseBlobFile(); @@ -91,6 +94,8 @@ class BlobFileBuilder { uint64_t min_blob_size_; uint64_t blob_file_size_; CompressionType blob_compression_type_; + std::unique_ptr blob_compressor_; + mutable Compressor::ManagedWorkingArea blob_compressor_wa_; PrepopulateBlobCache prepopulate_blob_cache_; const FileOptions* file_options_; const WriteOptions* write_options_; diff --git a/db/blob/blob_file_builder_test.cc b/db/blob/blob_file_builder_test.cc index 8a2ecff13a74..ad09238e2f4f 100644 --- a/db/blob/blob_file_builder_test.cc +++ b/db/blob/blob_file_builder_test.cc @@ -403,23 +403,19 @@ TEST_F(BlobFileBuilderTest, Compression) { ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number); ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1); - CompressionOptions opts; - CompressionContext context(kSnappyCompression, opts); - constexpr uint64_t sample_for_compression = 0; - - CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), - kSnappyCompression, sample_for_compression); - - std::string compressed_value; - ASSERT_TRUE(Snappy_Compress(info, uncompressed_value.data(), - uncompressed_value.size(), &compressed_value)); + auto compressor = + GetBuiltinV2CompressionManager()->GetCompressor({}, kSnappyCompression); + GrowableBuffer compressed_value; + ASSERT_OK(LegacyForceBuiltinCompression(*compressor, /*working_area=*/nullptr, + uncompressed_value, + &compressed_value)); ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), BlobLogRecord::kHeaderSize + key_size + compressed_value.size()); // Verify the contents of the new blob file as well as the blob reference std::vector> expected_key_value_pairs{ - {key, compressed_value}}; + {key, compressed_value.AsSlice().ToString()}}; std::vector blob_indexes{blob_index}; VerifyBlobFile(blob_file_number, blob_file_path, column_family_id, @@ -458,11 +454,12 @@ TEST_F(BlobFileBuilderTest, CompressionError) { nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); - SyncPoint::GetInstance()->SetCallBack("CompressData:TamperWithReturnValue", - [](void* arg) { - bool* ret = static_cast(arg); - *ret = false; - }); + SyncPoint::GetInstance()->SetCallBack( + "LegacyForceBuiltinCompression:TamperWithStatus", [](void* arg) { + Status* ret = static_cast(arg); + ASSERT_OK(*ret); + *ret = Status::Corruption("Tampered result"); + }); SyncPoint::GetInstance()->EnableProcessing(); constexpr char key[] = "1"; @@ -470,7 +467,7 @@ TEST_F(BlobFileBuilderTest, CompressionError) { std::string blob_index; - ASSERT_TRUE(builder.Add(key, value, &blob_index).IsCorruption()); + ASSERT_EQ(builder.Add(key, value, &blob_index).code(), Status::kCorruption); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); diff --git a/db/blob/blob_file_meta.h b/db/blob/blob_file_meta.h index d7c8a124336d..2e47726f8d11 100644 --- a/db/blob/blob_file_meta.h +++ b/db/blob/blob_file_meta.h @@ -6,6 +6,7 @@ #pragma once #include +#include #include #include #include diff --git a/db/blob/blob_file_reader.cc b/db/blob/blob_file_reader.cc index 0c30efbc119f..3f419c5a0814 100644 --- a/db/blob/blob_file_reader.cc +++ b/db/blob/blob_file_reader.cc @@ -17,10 +17,10 @@ #include "rocksdb/file_system.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" +#include "table/format.h" #include "table/multiget_context.h" #include "test_util/sync_point.h" #include "util/compression.h" -#include "util/crc32c.h" #include "util/stop_watch.h" namespace ROCKSDB_NAMESPACE { @@ -69,9 +69,16 @@ Status BlobFileReader::Create( } } - blob_file_reader->reset( - new BlobFileReader(std::move(file_reader), file_size, compression_type, - immutable_options.clock, statistics)); + std::shared_ptr decompressor; + if (compression_type != kNoCompression) { + // The blob format has always used compression format 2 + decompressor = GetBuiltinV2CompressionManager()->GetDecompressorOptimizeFor( + compression_type); + } + + blob_file_reader->reset(new BlobFileReader( + std::move(file_reader), file_size, compression_type, + std::move(decompressor), immutable_options.clock, statistics)); return Status::OK(); } @@ -250,7 +257,8 @@ Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader, Status s; IOOptions io_options; - s = file_reader->PrepareIOOptions(read_options, io_options); + IODebugContext dbg; + s = file_reader->PrepareIOOptions(read_options, io_options, &dbg); if (!s.ok()) { return s; } @@ -259,13 +267,13 @@ Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader, constexpr char* scratch = nullptr; s = file_reader->Read(io_options, read_offset, read_size, slice, scratch, - aligned_buf); + aligned_buf, &dbg); } else { buf->reset(new char[read_size]); constexpr AlignedBuf* aligned_scratch = nullptr; s = file_reader->Read(io_options, read_offset, read_size, slice, buf->get(), - aligned_scratch); + aligned_scratch, &dbg); } if (!s.ok()) { @@ -281,11 +289,13 @@ Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader, BlobFileReader::BlobFileReader( std::unique_ptr&& file_reader, uint64_t file_size, - CompressionType compression_type, SystemClock* clock, + CompressionType compression_type, + std::shared_ptr decompressor, SystemClock* clock, Statistics* statistics) : file_reader_(std::move(file_reader)), file_size_(file_size), compression_type_(compression_type), + decompressor_(std::move(decompressor)), clock_(clock), statistics_(statistics) { assert(file_reader_); @@ -334,7 +344,8 @@ Status BlobFileReader::GetBlob( constexpr bool for_compaction = true; IOOptions io_options; - s = file_reader_->PrepareIOOptions(read_options, io_options); + IODebugContext dbg; + s = file_reader_->PrepareIOOptions(read_options, io_options, &dbg); if (!s.ok()) { return s; } @@ -373,8 +384,9 @@ Status BlobFileReader::GetBlob( const Slice value_slice(record_slice.data() + adjustment, value_size); { - const Status s = UncompressBlobIfNeeded( - value_slice, compression_type, allocator, clock_, statistics_, result); + const Status s = UncompressBlobIfNeeded(value_slice, compression_type, + decompressor_.get(), allocator, + clock_, statistics_, result); if (!s.ok()) { return s; } @@ -463,10 +475,11 @@ void BlobFileReader::MultiGetBlob( PERF_COUNTER_ADD(blob_read_count, num_blobs); PERF_COUNTER_ADD(blob_read_byte, total_len); IOOptions opts; - s = file_reader_->PrepareIOOptions(read_options, opts); + IODebugContext dbg; + s = file_reader_->PrepareIOOptions(read_options, opts, &dbg); if (s.ok()) { s = file_reader_->MultiRead(opts, read_reqs.data(), read_reqs.size(), - direct_io ? &aligned_buf : nullptr); + direct_io ? &aligned_buf : nullptr, &dbg); } if (!s.ok()) { for (auto& req : read_reqs) { @@ -521,9 +534,9 @@ void BlobFileReader::MultiGetBlob( // Uncompress blob if needed Slice value_slice(record_slice.data() + adjustments[i], req->len); - *req->status = - UncompressBlobIfNeeded(value_slice, compression_type_, allocator, - clock_, statistics_, &blob_reqs[i].second); + *req->status = UncompressBlobIfNeeded( + value_slice, compression_type_, decompressor_.get(), allocator, clock_, + statistics_, &blob_reqs[i].second); if (req->status->ok()) { total_bytes += record_slice.size(); } @@ -580,8 +593,8 @@ Status BlobFileReader::VerifyBlob(const Slice& record_slice, Status BlobFileReader::UncompressBlobIfNeeded( const Slice& value_slice, CompressionType compression_type, - MemoryAllocator* allocator, SystemClock* clock, Statistics* statistics, - std::unique_ptr* result) { + Decompressor* decompressor, MemoryAllocator* allocator, SystemClock* clock, + Statistics* statistics, std::unique_ptr* result) { assert(result); if (compression_type == kNoCompression) { @@ -590,31 +603,33 @@ Status BlobFileReader::UncompressBlobIfNeeded( return Status::OK(); } - UncompressionContext context(compression_type); - UncompressionInfo info(context, UncompressionDict::GetEmptyDict(), - compression_type); + assert(decompressor); + + Decompressor::Args args; + args.compression_type = compression_type; + args.compressed_data = value_slice; - size_t uncompressed_size = 0; - constexpr uint32_t compression_format_version = 2; + Status s = decompressor->ExtractUncompressedSize(args); + if (!s.ok()) { + return Status::Corruption(s.ToString()); + } - CacheAllocationPtr output; + CacheAllocationPtr output = AllocateBlock(args.uncompressed_size, allocator); { PERF_TIMER_GUARD(blob_decompress_time); StopWatch stop_watch(clock, statistics, BLOB_DB_DECOMPRESSION_MICROS); - output = UncompressData(info, value_slice.data(), value_slice.size(), - &uncompressed_size, compression_format_version, - allocator); + s = decompressor->DecompressBlock(args, output.get()); } TEST_SYNC_POINT_CALLBACK( - "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", &output); + "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", &s); - if (!output) { - return Status::Corruption("Unable to uncompress blob"); + if (!s.ok()) { + return Status::Corruption(s.ToString()); } - result->reset(new BlobContents(std::move(output), uncompressed_size)); + result->reset(new BlobContents(std::move(output), args.uncompressed_size)); return Status::OK(); } diff --git a/db/blob/blob_file_reader.h b/db/blob/blob_file_reader.h index fa8aa501d45f..e13e3380302a 100644 --- a/db/blob/blob_file_reader.h +++ b/db/blob/blob_file_reader.h @@ -10,6 +10,7 @@ #include "db/blob/blob_read_request.h" #include "file/random_access_file_reader.h" +#include "rocksdb/advanced_compression.h" #include "rocksdb/compression_type.h" #include "rocksdb/rocksdb_namespace.h" #include "util/autovector.h" @@ -64,7 +65,8 @@ class BlobFileReader { private: BlobFileReader(std::unique_ptr&& file_reader, uint64_t file_size, CompressionType compression_type, - SystemClock* clock, Statistics* statistics); + std::shared_ptr decompressor, SystemClock* clock, + Statistics* statistics); static Status OpenFile(const ImmutableOptions& immutable_options, const FileOptions& file_opts, @@ -96,6 +98,7 @@ class BlobFileReader { static Status UncompressBlobIfNeeded(const Slice& value_slice, CompressionType compression_type, + Decompressor* decompressor, MemoryAllocator* allocator, SystemClock* clock, Statistics* statistics, @@ -104,6 +107,7 @@ class BlobFileReader { std::unique_ptr file_reader_; uint64_t file_size_; CompressionType compression_type_; + std::shared_ptr decompressor_; SystemClock* clock_; Statistics* statistics_; }; diff --git a/db/blob/blob_file_reader_test.cc b/db/blob/blob_file_reader_test.cc index 676cbed41e85..0e98d2619b02 100644 --- a/db/blob/blob_file_reader_test.cc +++ b/db/blob/blob_file_reader_test.cc @@ -65,7 +65,7 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header)); - std::vector compressed_blobs(num); + std::vector compressed_blobs(num); std::vector blobs_to_write(num); if (kNoCompression == compression) { for (size_t i = 0; i < num; ++i) { @@ -73,17 +73,13 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, blob_sizes[i] = blobs[i].size(); } } else { - CompressionOptions opts; - CompressionContext context(compression, opts); - constexpr uint64_t sample_for_compression = 0; - CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), - compression, sample_for_compression); - - constexpr uint32_t compression_format_version = 2; + auto compressor = + GetBuiltinV2CompressionManager()->GetCompressor({}, compression); for (size_t i = 0; i < num; ++i) { - ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version, - &compressed_blobs[i])); + ASSERT_OK(LegacyForceBuiltinCompression(*compressor, + /*working_area=*/nullptr, + blobs[i], &compressed_blobs[i])); blobs_to_write[i] = compressed_blobs[i]; blob_sizes[i] = compressed_blobs[i].size(); } @@ -810,11 +806,10 @@ TEST_F(BlobFileReaderTest, UncompressionError) { SyncPoint::GetInstance()->SetCallBack( "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) { - CacheAllocationPtr* const output = - static_cast(arg); - assert(output); + auto* result = static_cast(arg); + assert(result); - output->reset(); + *result = Status::Corruption("Injected result"); }); SyncPoint::GetInstance()->EnableProcessing(); @@ -825,11 +820,12 @@ TEST_F(BlobFileReaderTest, UncompressionError) { std::unique_ptr value; uint64_t bytes_read = 0; - ASSERT_TRUE(reader - ->GetBlob(ReadOptions(), key, blob_offset, blob_size, - kSnappyCompression, prefetch_buffer, allocator, - &value, &bytes_read) - .IsCorruption()); + ASSERT_EQ(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kSnappyCompression, prefetch_buffer, allocator, + &value, &bytes_read) + .code(), + Status::Code::kCorruption); ASSERT_EQ(value, nullptr); ASSERT_EQ(bytes_read, 0); diff --git a/db/blob/blob_index.h b/db/blob/blob_index.h index e9944d78448b..fda6f946a672 100644 --- a/db/blob/blob_index.h +++ b/db/blob/blob_index.h @@ -137,6 +137,18 @@ class BlobIndex { return oss.str(); } + // Encode this blob index into dst based on its type. + void EncodeTo(std::string* dst) const { + if (IsInlined()) { + EncodeInlinedTTL(dst, expiration_, value_); + } else if (HasTTL()) { + EncodeBlobTTL(dst, expiration_, file_number_, offset_, size_, + compression_); + } else { + EncodeBlob(dst, file_number_, offset_, size_, compression_); + } + } + static void EncodeInlinedTTL(std::string* dst, uint64_t expiration, const Slice& value) { assert(dst != nullptr); diff --git a/db/blob/blob_source_test.cc b/db/blob/blob_source_test.cc index d0e9def7d8b8..07c47ee50256 100644 --- a/db/blob/blob_source_test.cc +++ b/db/blob/blob_source_test.cc @@ -67,7 +67,7 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header)); - std::vector compressed_blobs(num); + std::vector compressed_blobs(num); std::vector blobs_to_write(num); if (kNoCompression == compression) { for (size_t i = 0; i < num; ++i) { @@ -75,17 +75,13 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, blob_sizes[i] = blobs[i].size(); } } else { - CompressionOptions opts; - CompressionContext context(compression, opts); - constexpr uint64_t sample_for_compression = 0; - CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), - compression, sample_for_compression); - - constexpr uint32_t compression_format_version = 2; + auto compressor = + GetBuiltinV2CompressionManager()->GetCompressor({}, compression); for (size_t i = 0; i < num; ++i) { - ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version, - &compressed_blobs[i])); + ASSERT_OK(LegacyForceBuiltinCompression(*compressor, + /*working_area=*/nullptr, + blobs[i], &compressed_blobs[i])); blobs_to_write[i] = compressed_blobs[i]; blob_sizes[i] = compressed_blobs[i].size(); } diff --git a/db/builder.cc b/db/builder.cc index 08a9fecc7278..0ca00a45bd5f 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -56,6 +56,18 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, return tboptions.moptions.table_factory->NewTableBuilder(tboptions, file); } +void ExtractTimestampFromTableProperties(const TableProperties& tp, + FileMetaData* meta) { + auto min_ts_iter = tp.user_collected_properties.find("rocksdb.timestamp_min"); + if (min_ts_iter != tp.user_collected_properties.end()) { + meta->min_timestamp = min_ts_iter->second; + } + auto max_ts_iter = tp.user_collected_properties.find("rocksdb.timestamp_max"); + if (max_ts_iter != tp.user_collected_properties.end()) { + meta->max_timestamp = max_ts_iter->second; + } +} + Status BuildTable( const std::string& dbname, VersionSet* versions, const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, @@ -74,8 +86,8 @@ Status BuildTable( EventLogger* event_logger, int job_id, TableProperties* table_properties, Env::WriteLifeTimeHint write_hint, const std::string* full_history_ts_low, BlobFileCompletionCallback* blob_callback, Version* version, - uint64_t* num_input_entries, uint64_t* memtable_payload_bytes, - uint64_t* memtable_garbage_bytes) { + uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes, + InternalStats::CompactionStats* flush_stats) { assert((tboptions.column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == tboptions.column_family_name.empty()); @@ -145,7 +157,9 @@ Status BuildTable( bool use_direct_writes = file_options.use_direct_writes; TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes); #endif // !NDEBUG - IOStatus io_s = NewWritableFile(fs, fname, &file, file_options); + FileOptions fo_copy = file_options; + fo_copy.write_hint = write_hint; + IOStatus io_s = NewWritableFile(fs, fname, &file, fo_copy); assert(s.ok()); s = io_s; if (io_status->ok()) { @@ -163,7 +177,9 @@ Status BuildTable( table_file_created = true; FileTypeSet tmp_set = ioptions.checksum_handoff_file_types; file->SetIOPriority(tboptions.write_options.rate_limiter_priority); - file->SetWriteLifeTimeHint(write_hint); + // Subsequent attempts to override the hint via SetWriteLifeTimeHint + // with the very same value will be ignored by the fs. + file->SetWriteLifeTimeHint(fo_copy.write_hint); file_writer.reset(new WritableFileWriter( std::move(file), fname, file_options, ioptions.clock, io_tracer, ioptions.stats, Histograms::SST_WRITE_MICROS, ioptions.listeners, @@ -197,8 +213,7 @@ Status BuildTable( CompactionIterator c_iter( iter, ucmp, &merge, kMaxSequenceNumber, &snapshots, earliest_snapshot, earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env, - ShouldReportDetailedTime(env, ioptions.stats), - true /* internal key corruption is not ok */, range_del_agg.get(), + ShouldReportDetailedTime(env, ioptions.stats), range_del_agg.get(), blob_file_builder.get(), ioptions.allow_data_in_errors, ioptions.enforce_single_del_contracts, /*manual_compaction_canceled=*/kManualCompactionCanceledFalse, @@ -214,8 +229,7 @@ Status BuildTable( const Slice& key = c_iter.key(); const Slice& value = c_iter.value(); ParsedInternalKey ikey = c_iter.ikey(); - key_after_flush_buf.assign(key.data(), key.size()); - Slice key_after_flush = key_after_flush_buf; + Slice key_after_flush = key; Slice value_after_flush = value; if (ikey.type == kTypeValuePreferredSeqno) { @@ -233,6 +247,7 @@ Status BuildTable( std::min(smallest_preferred_seqno, preferred_seqno); } else { // Cannot get a useful preferred seqno, convert it to a kTypeValue. + key_after_flush_buf.assign(key.data(), key.size()); UpdateInternalKey(&key_after_flush_buf, ikey.sequence, kTypeValue); ikey = ParsedInternalKey(ikey.user_key, ikey.sequence, kTypeValue); key_after_flush = key_after_flush_buf; @@ -249,6 +264,10 @@ Status BuildTable( } builder->Add(key_after_flush, value_after_flush); + if (flush_stats) { + flush_stats->num_output_records++; + } + s = meta->UpdateBoundaries(key_after_flush, value_after_flush, ikey.sequence, ikey.type); if (!s.ok()) { @@ -280,6 +299,9 @@ Status BuildTable( auto tombstone = range_del_it->Tombstone(); std::pair kv = tombstone.Serialize(); builder->Add(kv.first.Encode(), kv.second); + if (flush_stats) { + flush_stats->num_output_records++; + } InternalKey tombstone_end = tombstone.SerializeEndKey(); meta->UpdateBoundariesForRange(kv.first, tombstone_end, tombstone.seq_, tboptions.internal_comparator); @@ -301,9 +323,9 @@ Status BuildTable( TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable"); const bool empty = builder->IsEmpty(); - if (num_input_entries != nullptr) { + if (flush_stats) { assert(c_iter.HasNumInputEntryScanned()); - *num_input_entries = + flush_stats->num_input_records = c_iter.NumInputEntryScanned() + num_unfragmented_tombstones; } if (!s.ok() || empty) { @@ -330,6 +352,12 @@ Status BuildTable( } if (s.ok() && !empty) { + if (flush_stats) { + flush_stats->bytes_written_pre_comp = builder->PreCompressionSize(); + // Add worker CPU micros here. Caller needs to add CPU micros from + // calling thread. + flush_stats->cpu_micros += builder->GetWorkerCPUMicros(); + } uint64_t file_size = builder->FileSize(); meta->fd.file_size = file_size; meta->tail_size = builder->GetTailSize(); @@ -339,6 +367,7 @@ Status BuildTable( assert(meta->fd.GetFileSize() > 0); tp = builder ->GetTableProperties(); // refresh now that builder is finished + ExtractTimestampFromTableProperties(tp, meta); if (memtable_payload_bytes != nullptr && memtable_garbage_bytes != nullptr) { const CompactionIterationStats& ci_stats = c_iter.iter_stats(); diff --git a/db/builder.h b/db/builder.h index 08dd5fcab001..9f83a6f5dc16 100644 --- a/db/builder.h +++ b/db/builder.h @@ -10,6 +10,7 @@ #include #include +#include "db/internal_stats.h" #include "db/range_tombstone_fragmenter.h" #include "db/seqno_to_time_mapping.h" #include "db/table_properties_collector.h" @@ -34,13 +35,19 @@ class SnapshotChecker; class TableCache; class TableBuilder; class WritableFileWriter; -class InternalStats; class BlobFileCompletionCallback; // Convenience function for NewTableBuilder on the embedded table_factory. TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, WritableFileWriter* file); +// Extract min/max timestamps from table properties and populate FileMetaData. +// This is used by both flush (BuildTable) and compaction (CompactionOutputs) +// to populate timestamp range in FileMetaData from the TimestampTableProperties +// collector output. +void ExtractTimestampFromTableProperties(const TableProperties& tp, + FileMetaData* meta); + // Build a Table file from the contents of *iter. The generated file // will be named according to number specified in meta. On success, the rest of // *meta will be filled with metadata about the generated table. @@ -49,6 +56,7 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, // // @param column_family_name Name of the column family that is also identified // by column_family_id, or empty string if unknown. +// @param flush_stats treat flush as level 0 compaction in internal stats Status BuildTable( const std::string& dbname, VersionSet* versions, const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, @@ -69,8 +77,8 @@ Status BuildTable( Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET, const std::string* full_history_ts_low = nullptr, BlobFileCompletionCallback* blob_callback = nullptr, - Version* version = nullptr, uint64_t* num_input_entries = nullptr, - uint64_t* memtable_payload_bytes = nullptr, - uint64_t* memtable_garbage_bytes = nullptr); + Version* version = nullptr, uint64_t* memtable_payload_bytes = nullptr, + uint64_t* memtable_garbage_bytes = nullptr, + InternalStats::CompactionStats* flush_stats = nullptr); } // namespace ROCKSDB_NAMESPACE diff --git a/db/c.cc b/db/c.cc index b101540ffa1b..6e00a0761cf6 100644 --- a/db/c.cc +++ b/db/c.cc @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -24,12 +25,14 @@ #include "rocksdb/experimental.h" #include "rocksdb/filter_policy.h" #include "rocksdb/iterator.h" +#include "rocksdb/listener.h" #include "rocksdb/memtablerep.h" #include "rocksdb/merge_operator.h" #include "rocksdb/options.h" #include "rocksdb/perf_context.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/sst_file_manager.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" @@ -49,6 +52,7 @@ #include "util/stderr_logger.h" #include "utilities/merge_operators.h" +using ROCKSDB_NAMESPACE::BackgroundErrorReason; using ROCKSDB_NAMESPACE::BackupEngine; using ROCKSDB_NAMESPACE::BackupEngineOptions; using ROCKSDB_NAMESPACE::BackupID; @@ -65,7 +69,14 @@ using ROCKSDB_NAMESPACE::ColumnFamilyMetaData; using ROCKSDB_NAMESPACE::ColumnFamilyOptions; using ROCKSDB_NAMESPACE::CompactionFilter; using ROCKSDB_NAMESPACE::CompactionFilterFactory; +using ROCKSDB_NAMESPACE::CompactionJobInfo; using ROCKSDB_NAMESPACE::CompactionOptionsFIFO; +using ROCKSDB_NAMESPACE::CompactionReason; +using ROCKSDB_NAMESPACE::CompactionService; +using ROCKSDB_NAMESPACE::CompactionServiceJobInfo; +using ROCKSDB_NAMESPACE::CompactionServiceJobStatus; +using ROCKSDB_NAMESPACE::CompactionServiceOptionsOverride; +using ROCKSDB_NAMESPACE::CompactionServiceScheduleResponse; using ROCKSDB_NAMESPACE::CompactRangeOptions; using ROCKSDB_NAMESPACE::Comparator; using ROCKSDB_NAMESPACE::CompressionType; @@ -76,11 +87,18 @@ using ROCKSDB_NAMESPACE::DBOptions; using ROCKSDB_NAMESPACE::DbPath; using ROCKSDB_NAMESPACE::Env; using ROCKSDB_NAMESPACE::EnvOptions; +using ROCKSDB_NAMESPACE::EventListener; +using ROCKSDB_NAMESPACE::ExportImportFilesMetaData; +using ROCKSDB_NAMESPACE::ExternalFileIngestionInfo; +using ROCKSDB_NAMESPACE::FileChecksumGenFactory; using ROCKSDB_NAMESPACE::FileLock; using ROCKSDB_NAMESPACE::FilterPolicy; +using ROCKSDB_NAMESPACE::FlushJobInfo; using ROCKSDB_NAMESPACE::FlushOptions; +using ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory; using ROCKSDB_NAMESPACE::HistogramData; using ROCKSDB_NAMESPACE::HyperClockCacheOptions; +using ROCKSDB_NAMESPACE::ImportColumnFamilyOptions; using ROCKSDB_NAMESPACE::InfoLogLevel; using ROCKSDB_NAMESPACE::IngestExternalFileOptions; using ROCKSDB_NAMESPACE::Iterator; @@ -90,12 +108,15 @@ using ROCKSDB_NAMESPACE::Logger; using ROCKSDB_NAMESPACE::LRUCacheOptions; using ROCKSDB_NAMESPACE::MemoryAllocator; using ROCKSDB_NAMESPACE::MemoryUtil; +using ROCKSDB_NAMESPACE::MemTableInfo; using ROCKSDB_NAMESPACE::MergeOperator; using ROCKSDB_NAMESPACE::NewBloomFilterPolicy; using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory; using ROCKSDB_NAMESPACE::NewGenericRateLimiter; using ROCKSDB_NAMESPACE::NewLRUCache; using ROCKSDB_NAMESPACE::NewRibbonFilterPolicy; +using ROCKSDB_NAMESPACE::NewSstPartitionerFixedPrefixFactory; +using ROCKSDB_NAMESPACE::OpenAndCompactOptions; using ROCKSDB_NAMESPACE::OptimisticTransactionDB; using ROCKSDB_NAMESPACE::OptimisticTransactionOptions; using ROCKSDB_NAMESPACE::Options; @@ -113,10 +134,14 @@ using ROCKSDB_NAMESPACE::Slice; using ROCKSDB_NAMESPACE::SliceParts; using ROCKSDB_NAMESPACE::SliceTransform; using ROCKSDB_NAMESPACE::Snapshot; +using ROCKSDB_NAMESPACE::SstFileManager; using ROCKSDB_NAMESPACE::SstFileMetaData; using ROCKSDB_NAMESPACE::SstFileWriter; +using ROCKSDB_NAMESPACE::SstPartitionerFactory; using ROCKSDB_NAMESPACE::Status; using ROCKSDB_NAMESPACE::StderrLogger; +using ROCKSDB_NAMESPACE::SubcompactionJobInfo; +using ROCKSDB_NAMESPACE::TableFactory; using ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory; using ROCKSDB_NAMESPACE::Transaction; using ROCKSDB_NAMESPACE::TransactionDB; @@ -130,6 +155,8 @@ using ROCKSDB_NAMESPACE::WriteBatch; using ROCKSDB_NAMESPACE::WriteBatchWithIndex; using ROCKSDB_NAMESPACE::WriteBufferManager; using ROCKSDB_NAMESPACE::WriteOptions; +using ROCKSDB_NAMESPACE::WriteStallCondition; +using ROCKSDB_NAMESPACE::WriteStallInfo; using std::unordered_set; using std::vector; @@ -139,6 +166,9 @@ extern "C" { struct rocksdb_t { DB* rep; }; +struct rocksdb_status_ptr_t { + Status* rep; +}; struct rocksdb_backup_engine_t { BackupEngine* rep; }; @@ -211,6 +241,15 @@ struct rocksdb_filelock_t { struct rocksdb_logger_t { std::shared_ptr rep; }; +struct rocksdb_file_checksum_gen_factory_t { + std::shared_ptr rep; +}; +struct rocksdb_sst_partitioner_factory_t { + std::shared_ptr rep; +}; +struct rocksdb_table_properties_collector_factory_t { + std::shared_ptr rep; +}; struct rocksdb_lru_cache_options_t { LRUCacheOptions rep; }; @@ -226,6 +265,12 @@ struct rocksdb_cache_t { struct rocksdb_write_buffer_manager_t { std::shared_ptr rep; }; +struct rocksdb_sst_file_manager_t { + std::shared_ptr rep; +}; +struct rocksdb_livefile_t { + LiveFileMetaData rep; +}; struct rocksdb_livefiles_t { std::vector rep; }; @@ -236,6 +281,12 @@ struct rocksdb_column_family_handle_t { struct rocksdb_column_family_metadata_t { ColumnFamilyMetaData rep; }; +struct rocksdb_export_import_files_metadata_t { + ExportImportFilesMetaData* rep; +}; +struct rocksdb_import_column_family_options_t { + ImportColumnFamilyOptions rep; +}; struct rocksdb_level_metadata_t { const LevelMetaData* rep; }; @@ -292,11 +343,49 @@ struct rocksdb_compactionfiltercontext_t { CompactionFilter::Context rep; }; +struct rocksdb_flushjobinfo_t { + FlushJobInfo rep; +}; +struct rocksdb_writestallcondition_t { + WriteStallCondition rep; +}; +struct rocksdb_writestallinfo_t { + WriteStallInfo rep; +}; +struct rocksdb_memtableinfo_t { + MemTableInfo rep; +}; +struct rocksdb_compactionjobinfo_t { + CompactionJobInfo rep; +}; +struct rocksdb_subcompactionjobinfo_t { + SubcompactionJobInfo rep; +}; +struct rocksdb_externalfileingestioninfo_t { + ExternalFileIngestionInfo rep; +}; + struct rocksdb_statistics_histogram_data_t { rocksdb_statistics_histogram_data_t() : rep() {} HistogramData rep; }; +struct rocksdb_compactionservice_scheduleresponse_t { + CompactionServiceScheduleResponse rep; +}; + +struct rocksdb_compactionservice_jobinfo_t { + CompactionServiceJobInfo rep; +}; + +struct rocksdb_compaction_service_options_override_t { + CompactionServiceOptionsOverride rep; +}; + +struct rocksdb_open_and_compact_options_t { + OpenAndCompactOptions rep; +}; + struct rocksdb_compactionfilter_t : public CompactionFilter { void* state_; void (*destructor_)(void*); @@ -507,7 +596,6 @@ struct rocksdb_slicetransform_t : public SliceTransform { char* (*transform_)(void*, const char* key, size_t length, size_t* dst_length); unsigned char (*in_domain_)(void*, const char* key, size_t length); - unsigned char (*in_range_)(void*, const char* key, size_t length); ~rocksdb_slicetransform_t() override { (*destructor_)(state_); } @@ -522,10 +610,6 @@ struct rocksdb_slicetransform_t : public SliceTransform { bool InDomain(const Slice& src) const override { return (*in_domain_)(state_, src.data(), src.size()); } - - bool InRange(const Slice& src) const override { - return (*in_range_)(state_, src.data(), src.size()); - } }; struct rocksdb_universal_compaction_options_t { @@ -583,21 +667,563 @@ static bool SaveError(char** errptr, const Status& s) { return true; } -// Copies str to a new malloc()-ed buffer. The buffer is not NUL terminated. -static char* CopyString(const std::string& str) { - char* result = reinterpret_cast(malloc(sizeof(char) * str.size())); - memcpy(result, str.data(), sizeof(char) * str.size()); +// Helper function to copy string data to a malloc'd buffer +// Works with std::string, Slice, and PinnableSlice through implicit conversion +static inline char* CopyString(const Slice& slice) { + char* result = reinterpret_cast(malloc(slice.size())); + memcpy(result, slice.data(), slice.size()); + return result; +} + +const char* rocksdb_compactionservice_jobinfo_t_get_db_name( + const rocksdb_compactionservice_jobinfo_t* info, size_t* len) { + *len = info->rep.db_name.size(); + return info->rep.db_name.data(); +} + +const char* rocksdb_compactionservice_jobinfo_t_get_db_id( + const rocksdb_compactionservice_jobinfo_t* info, size_t* len) { + *len = info->rep.db_id.size(); + return info->rep.db_id.data(); +} + +const char* rocksdb_compactionservice_jobinfo_t_get_db_session_id( + const rocksdb_compactionservice_jobinfo_t* info, size_t* len) { + *len = info->rep.db_session_id.size(); + return info->rep.db_session_id.data(); +} + +const char* rocksdb_compactionservice_jobinfo_t_get_cf_name( + const rocksdb_compactionservice_jobinfo_t* info, size_t* len) { + *len = info->rep.cf_name.size(); + return info->rep.cf_name.data(); +} + +uint32_t rocksdb_compactionservice_jobinfo_t_get_cf_id( + const rocksdb_compactionservice_jobinfo_t* info) { + return info->rep.cf_id; +} + +uint64_t rocksdb_compactionservice_jobinfo_t_get_job_id( + const rocksdb_compactionservice_jobinfo_t* info) { + return info->rep.job_id; +} + +int rocksdb_compactionservice_jobinfo_t_get_priority( + const rocksdb_compactionservice_jobinfo_t* info) { + return static_cast(info->rep.priority); +} + +int rocksdb_compactionservice_jobinfo_t_get_compaction_reason( + const rocksdb_compactionservice_jobinfo_t* info) { + return static_cast(info->rep.compaction_reason); +} + +int rocksdb_compactionservice_jobinfo_t_get_base_input_level( + const rocksdb_compactionservice_jobinfo_t* info) { + return info->rep.base_input_level; +} + +int rocksdb_compactionservice_jobinfo_t_get_output_level( + const rocksdb_compactionservice_jobinfo_t* info) { + return info->rep.output_level; +} + +unsigned char rocksdb_compactionservice_jobinfo_t_is_full_compaction( + const rocksdb_compactionservice_jobinfo_t* info) { + return info->rep.is_full_compaction; +} + +unsigned char rocksdb_compactionservice_jobinfo_t_is_manual_compaction( + const rocksdb_compactionservice_jobinfo_t* info) { + return info->rep.is_manual_compaction; +} + +unsigned char rocksdb_compactionservice_jobinfo_t_is_bottommost_level( + const rocksdb_compactionservice_jobinfo_t* info) { + return info->rep.bottommost_level; +} + +// Helper function to validate compaction service job status +static inline bool IsValidCompactionServiceJobStatus(int status) { + return status >= rocksdb_compactionservice_jobstatus_success && + status <= rocksdb_compactionservice_jobstatus_use_local; +} + +rocksdb_compactionservice_scheduleresponse_t* +rocksdb_compactionservice_scheduleresponse_create(const char* scheduled_job_id, + int status, char** errptr) { + // Validate status is in range [success=0, failure=1, aborted=2, use_local=3] + if (!IsValidCompactionServiceJobStatus(status)) { + SaveError(errptr, + Status::InvalidArgument("Invalid status value. Must be 0-3.")); + return nullptr; + } + + rocksdb_compactionservice_scheduleresponse_t* response = + new rocksdb_compactionservice_scheduleresponse_t{ + CompactionServiceScheduleResponse( + scheduled_job_id ? std::string(scheduled_job_id) : "", + static_cast(status))}; + return response; +} + +rocksdb_compactionservice_scheduleresponse_t* +rocksdb_compactionservice_scheduleresponse_create_with_status(int status, + char** errptr) { + // Validate status is in range [success=0, failure=1, aborted=2, use_local=3] + if (!IsValidCompactionServiceJobStatus(status)) { + SaveError(errptr, + Status::InvalidArgument("Invalid status value. Must be 0-3.")); + return nullptr; + } + + rocksdb_compactionservice_scheduleresponse_t* response = + new rocksdb_compactionservice_scheduleresponse_t{ + CompactionServiceScheduleResponse( + static_cast(status))}; + return response; +} + +void rocksdb_compactionservice_scheduleresponse_t_destroy( + rocksdb_compactionservice_scheduleresponse_t* response) { + if (response) { + delete response; + } +} + +int rocksdb_compactionservice_scheduleresponse_getstatus( + const rocksdb_compactionservice_scheduleresponse_t* response) { + if (!response) { + return rocksdb_compactionservice_jobstatus_failure; + } + return static_cast(response->rep.status); +} + +const char* rocksdb_compactionservice_scheduleresponse_get_scheduled_job_id( + const rocksdb_compactionservice_scheduleresponse_t* response, size_t* len) { + if (!response || !len) { + if (len) { + *len = 0; + } + return ""; + } + *len = response->rep.scheduled_job_id.size(); + return response->rep.scheduled_job_id.data(); +} + +struct rocksdb_compactionservice_t : public CompactionService { + void* state_; + void (*destructor_)(void*); + rocksdb_compaction_service_schedule_cb schedule_; + std::string name_; + rocksdb_compaction_service_wait_cb wait_; + rocksdb_compaction_service_cancel_awaiting_jobs_cb cancel_awaiting_jobs_; + rocksdb_compaction_service_on_installation_cb on_installation_; + + rocksdb_compactionservice_t( + void* state, void (*destructor)(void*), + rocksdb_compaction_service_schedule_cb + rocksdb_compaction_service_schedule_ptr, + const char* name, rocksdb_compaction_service_wait_cb wait, + rocksdb_compaction_service_cancel_awaiting_jobs_cb cancel_awaiting_jobs, + rocksdb_compaction_service_on_installation_cb on_installation) + : state_(state), + destructor_(destructor), + schedule_(rocksdb_compaction_service_schedule_ptr), + name_(name ? name : "CompactionService"), + wait_(wait), + cancel_awaiting_jobs_(cancel_awaiting_jobs), + on_installation_(on_installation) {} + + ~rocksdb_compactionservice_t() override { + if (destructor_) { + (*destructor_)(state_); + } + } + + const char* Name() const override { return name_.c_str(); } + + CompactionServiceScheduleResponse Schedule( + const CompactionServiceJobInfo& info, + const std::string& compaction_service_input) override { + if (schedule_ == nullptr) { + return CompactionServiceScheduleResponse( + CompactionServiceJobStatus::kUseLocal); + } + + rocksdb_compactionservice_scheduleresponse_t* c_response = (*schedule_)( + state_, + reinterpret_cast(&info), + compaction_service_input.data(), compaction_service_input.size()); + + if (c_response == nullptr) { + return CompactionServiceScheduleResponse( + CompactionServiceJobStatus::kFailure); + } + + CompactionServiceScheduleResponse response = std::move(c_response->rep); + delete c_response; + return response; + } + + CompactionServiceJobStatus Wait(const std::string& scheduled_job_id, + std::string* result) override { + if (wait_ == nullptr) { + return CompactionServiceJobStatus::kUseLocal; + } + + char* c_result = nullptr; + size_t result_len = 0; + + int status = + (*wait_)(state_, scheduled_job_id.c_str(), &c_result, &result_len); + + if (c_result != nullptr) { + if (result != nullptr) { + result->assign(c_result, result_len); + } + free(c_result); + } + + return static_cast(status); + } + + void CancelAwaitingJobs() override { + if (cancel_awaiting_jobs_ != nullptr) { + (*cancel_awaiting_jobs_)(state_); + } + } + + void OnInstallation(const std::string& scheduled_job_id, + CompactionServiceJobStatus status) override { + if (on_installation_ != nullptr) { + (*on_installation_)(state_, scheduled_job_id.c_str(), + static_cast(status)); + } + } +}; + +rocksdb_compactionservice_t* rocksdb_compactionservice_create( + void* state, void (*destructor)(void*), + rocksdb_compaction_service_schedule_cb schedule, const char* name, + rocksdb_compaction_service_wait_cb wait, + rocksdb_compaction_service_cancel_awaiting_jobs_cb cancel_awaiting_jobs, + rocksdb_compaction_service_on_installation_cb on_installation) { + return new rocksdb_compactionservice_t(state, destructor, schedule, name, + wait, cancel_awaiting_jobs, + on_installation); +} + +void rocksdb_options_set_compaction_service( + rocksdb_options_t* opt, rocksdb_compactionservice_t* service) { + if (!opt || !service) { + return; + } + + opt->rep.compaction_service = std::shared_ptr(service); +} + +// CompactionServiceOptionsOverride functions +rocksdb_compaction_service_options_override_t* +rocksdb_compaction_service_options_override_create() { + return new rocksdb_compaction_service_options_override_t; +} + +rocksdb_compaction_service_options_override_t* +rocksdb_compaction_service_options_override_create_from_options( + rocksdb_options_t* options) { + if (!options) { + return nullptr; + } + + rocksdb_compaction_service_options_override_t* override_opts = + new rocksdb_compaction_service_options_override_t; + + // Copy all relevant options from rocksdb_options_t + override_opts->rep.env = options->rep.env; + override_opts->rep.file_checksum_gen_factory = + options->rep.file_checksum_gen_factory; + override_opts->rep.comparator = options->rep.comparator; + override_opts->rep.merge_operator = options->rep.merge_operator; + override_opts->rep.compaction_filter = options->rep.compaction_filter; + override_opts->rep.compaction_filter_factory = + options->rep.compaction_filter_factory; + override_opts->rep.prefix_extractor = options->rep.prefix_extractor; + override_opts->rep.table_factory = options->rep.table_factory; + override_opts->rep.sst_partitioner_factory = + options->rep.sst_partitioner_factory; + override_opts->rep.listeners = options->rep.listeners; + override_opts->rep.statistics = options->rep.statistics; + override_opts->rep.info_log = options->rep.info_log; + override_opts->rep.table_properties_collector_factories = + options->rep.table_properties_collector_factories; + + return override_opts; +} + +void rocksdb_compaction_service_options_override_destroy( + rocksdb_compaction_service_options_override_t* override_options) { + if (override_options) { + delete override_options; + } +} + +void rocksdb_compaction_service_options_override_set_env( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_env_t* env) { + if (override_options && env) { + override_options->rep.env = env->rep; + } +} + +void rocksdb_compaction_service_options_override_set_comparator( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_comparator_t* comparator) { + if (override_options && comparator) { + override_options->rep.comparator = comparator; + } +} + +void rocksdb_compaction_service_options_override_set_merge_operator( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_mergeoperator_t* merge_operator) { + if (override_options && merge_operator) { + override_options->rep.merge_operator = + std::shared_ptr(merge_operator); + } +} + +void rocksdb_compaction_service_options_override_set_compaction_filter( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_compactionfilter_t* compaction_filter) { + if (override_options && compaction_filter) { + override_options->rep.compaction_filter = compaction_filter; + } +} + +void rocksdb_compaction_service_options_override_set_compaction_filter_factory( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_compactionfilterfactory_t* compaction_filter_factory) { + if (override_options && compaction_filter_factory) { + override_options->rep.compaction_filter_factory = + std::shared_ptr(compaction_filter_factory); + } +} + +void rocksdb_compaction_service_options_override_set_prefix_extractor( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_slicetransform_t* prefix_extractor) { + if (override_options && prefix_extractor) { + override_options->rep.prefix_extractor = + std::shared_ptr(prefix_extractor); + } +} + +void rocksdb_compaction_service_options_override_set_block_based_table_factory( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_block_based_table_options_t* table_options) { + if (override_options && table_options) { + override_options->rep.table_factory = std::shared_ptr( + NewBlockBasedTableFactory(table_options->rep)); + } +} + +void rocksdb_compaction_service_options_override_set_cuckoo_table_factory( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_cuckoo_table_options_t* table_options) { + if (override_options && table_options) { + override_options->rep.table_factory = std::shared_ptr( + NewCuckooTableFactory(table_options->rep)); + } +} + +// Note: add_event_listener is defined later after rocksdb_eventlistener_t +// struct + +void rocksdb_compaction_service_options_override_set_statistics( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_options_t* options) { + if (override_options && options) { + override_options->rep.statistics = options->rep.statistics; + } +} + +void rocksdb_compaction_service_options_override_set_info_log( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_logger_t* logger) { + if (override_options && logger) { + override_options->rep.info_log = logger->rep; + } +} + +void rocksdb_compaction_service_options_override_set_option( + rocksdb_compaction_service_options_override_t* override_options, + const char* key, const char* value) { + if (override_options && key && value) { + override_options->rep.options_map[std::string(key)] = std::string(value); + } +} + +void rocksdb_compaction_service_options_override_set_file_checksum_gen_factory( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_file_checksum_gen_factory_t* factory) { + if (override_options && factory) { + override_options->rep.file_checksum_gen_factory = factory->rep; + } +} + +void rocksdb_compaction_service_options_override_set_sst_partitioner_factory( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_sst_partitioner_factory_t* factory) { + if (override_options && factory) { + override_options->rep.sst_partitioner_factory = factory->rep; + } +} + +void rocksdb_compaction_service_options_override_add_table_properties_collector_factory( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_table_properties_collector_factory_t* factory) { + if (override_options && factory) { + override_options->rep.table_properties_collector_factories.push_back( + factory->rep); + } +} + +// Atomic bool management for cancellation +unsigned char* rocksdb_open_and_compact_canceled_create() { + return reinterpret_cast(new std::atomic(false)); +} + +void rocksdb_open_and_compact_canceled_destroy(unsigned char* canceled) { + if (canceled) { + delete reinterpret_cast*>(canceled); + } +} + +void rocksdb_open_and_compact_canceled_set(unsigned char* canceled, + unsigned char value) { + if (canceled) { + reinterpret_cast*>(canceled)->store(value != 0); + } +} + +// OpenAndCompactOptions functions +rocksdb_open_and_compact_options_t* rocksdb_open_and_compact_options_create() { + return new rocksdb_open_and_compact_options_t; +} + +void rocksdb_open_and_compact_options_destroy( + rocksdb_open_and_compact_options_t* options) { + if (options) { + delete options; + } +} + +void rocksdb_open_and_compact_options_set_canceled( + rocksdb_open_and_compact_options_t* options, unsigned char* canceled) { + if (options && canceled) { + options->rep.canceled = reinterpret_cast*>(canceled); + } +} + +void rocksdb_open_and_compact_options_set_allow_resumption( + rocksdb_open_and_compact_options_t* options, + unsigned char allow_resumption) { + if (options) { + options->rep.allow_resumption = allow_resumption != 0; + } +} + +// OpenAndCompact functions +char* rocksdb_open_and_compact( + const char* db_path, const char* output_directory, const char* input, + size_t input_len, size_t* output_len, + const rocksdb_compaction_service_options_override_t* override_options, + char** errptr) { + if (!db_path || !output_directory || !input || !override_options) { + SaveError(errptr, Status::InvalidArgument("Invalid arguments")); + return nullptr; + } + + std::string input_str(input, input_len); + std::string output_str; + + Status s = DB::OpenAndCompact(db_path, output_directory, input_str, + &output_str, override_options->rep); + + if (!s.ok()) { + SaveError(errptr, s); + return nullptr; + } + + // Allocate +1 for null terminator + char* result = static_cast(malloc(output_str.size() + 1)); + if (!result) { + SaveError(errptr, Status::MemoryLimit("Failed to allocate output buffer")); + return nullptr; + } + + memcpy(result, output_str.data(), output_str.size()); + result[output_str.size()] = '\0'; + + // Only set output_len after successful allocation + if (output_len) { + *output_len = output_str.size(); + } + + return result; +} + +char* rocksdb_open_and_compact_with_options( + const rocksdb_open_and_compact_options_t* options, const char* db_path, + const char* output_directory, const char* input, size_t input_len, + size_t* output_len, + const rocksdb_compaction_service_options_override_t* override_options, + char** errptr) { + if (!options || !db_path || !output_directory || !input || + !override_options) { + SaveError(errptr, Status::InvalidArgument("Invalid arguments")); + return nullptr; + } + + std::string input_str(input, input_len); + std::string output_str; + + Status s = DB::OpenAndCompact(options->rep, db_path, output_directory, + input_str, &output_str, override_options->rep); + + if (!s.ok()) { + SaveError(errptr, s); + return nullptr; + } + + // Allocate +1 for null terminator + char* result = static_cast(malloc(output_str.size() + 1)); + if (!result) { + SaveError(errptr, Status::MemoryLimit("Failed to allocate output buffer")); + return nullptr; + } + + memcpy(result, output_str.data(), output_str.size()); + result[output_str.size()] = '\0'; // Null terminate + + // Only set output_len after successful allocation + if (output_len) { + *output_len = output_str.size(); + } + return result; } rocksdb_t* rocksdb_open(const rocksdb_options_t* options, const char* name, char** errptr) { - DB* db; - if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) { + std::unique_ptr dbptr; + if (SaveError(errptr, DB::Open(options->rep, std::string(name), &dbptr))) { return nullptr; } rocksdb_t* result = new rocksdb_t; - result->rep = db; + result->rep = dbptr.release(); return result; } @@ -617,13 +1243,14 @@ rocksdb_t* rocksdb_open_for_read_only(const rocksdb_options_t* options, const char* name, unsigned char error_if_wal_file_exists, char** errptr) { - DB* db; - if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name), - &db, error_if_wal_file_exists))) { + std::unique_ptr dbptr; + if (SaveError(errptr, + DB::OpenForReadOnly(options->rep, std::string(name), &dbptr, + error_if_wal_file_exists))) { return nullptr; } rocksdb_t* result = new rocksdb_t; - result->rep = db; + result->rep = dbptr.release(); return result; } @@ -631,14 +1258,14 @@ rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options, const char* name, const char* secondary_path, char** errptr) { - DB* db; + std::unique_ptr dbptr; if (SaveError(errptr, DB::OpenAsSecondary(options->rep, std::string(name), - std::string(secondary_path), &db))) { + std::string(secondary_path), &dbptr))) { return nullptr; } rocksdb_t* result = new rocksdb_t; - result->rep = db; + result->rep = dbptr.release(); return result; } @@ -884,6 +1511,10 @@ void rocksdb_backup_engine_options_destroy( delete options; } +void rocksdb_status_ptr_get_error(rocksdb_status_ptr_t* status, char** errptr) { + SaveError(errptr, *(status->rep)); +} + rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr) { Checkpoint* checkpoint; @@ -902,6 +1533,22 @@ void rocksdb_checkpoint_create(rocksdb_checkpoint_t* checkpoint, std::string(checkpoint_dir), log_size_for_flush)); } +rocksdb_export_import_files_metadata_t* rocksdb_checkpoint_export_column_family( + rocksdb_checkpoint_t* checkpoint, + rocksdb_column_family_handle_t* column_family, const char* export_dir, + char** errptr) { + ExportImportFilesMetaData* metadata = nullptr; + if (SaveError(errptr, + checkpoint->rep->ExportColumnFamily( + column_family->rep, std::string(export_dir), &metadata))) { + return nullptr; + } + rocksdb_export_import_files_metadata_t* result = + new rocksdb_export_import_files_metadata_t; + result->rep = metadata; + return result; +} + void rocksdb_checkpoint_object_destroy(rocksdb_checkpoint_t* checkpoint) { delete checkpoint->rep; delete checkpoint; @@ -932,11 +1579,11 @@ rocksdb_t* rocksdb_open_and_trim_history( std::string trim_ts_(trim_ts, trim_tslen); - DB* db; + std::unique_ptr dbptr; std::vector handles; if (SaveError(errptr, DB::OpenAndTrimHistory( DBOptions(db_options->rep), std::string(name), - column_families, &handles, &db, trim_ts_))) { + column_families, &handles, &dbptr, trim_ts_))) { return nullptr; } @@ -948,7 +1595,7 @@ rocksdb_t* rocksdb_open_and_trim_history( column_family_handles[i] = c_handle; } rocksdb_t* result = new rocksdb_t; - result->rep = db; + result->rep = dbptr.release(); return result; } @@ -964,10 +1611,10 @@ rocksdb_t* rocksdb_open_column_families( ColumnFamilyOptions(column_family_options[i]->rep)); } - DB* db; + std::unique_ptr dbptr; std::vector handles; if (SaveError(errptr, DB::Open(DBOptions(db_options->rep), std::string(name), - column_families, &handles, &db))) { + column_families, &handles, &dbptr))) { return nullptr; } @@ -979,7 +1626,7 @@ rocksdb_t* rocksdb_open_column_families( column_family_handles[i] = c_handle; } rocksdb_t* result = new rocksdb_t; - result->rep = db; + result->rep = dbptr.release(); return result; } @@ -1032,12 +1679,12 @@ rocksdb_t* rocksdb_open_for_read_only_column_families( ColumnFamilyOptions(column_family_options[i]->rep)); } - DB* db; + std::unique_ptr dbptr; std::vector handles; - if (SaveError(errptr, - DB::OpenForReadOnly(DBOptions(db_options->rep), - std::string(name), column_families, - &handles, &db, error_if_wal_file_exists))) { + if (SaveError(errptr, DB::OpenForReadOnly(DBOptions(db_options->rep), + std::string(name), column_families, + &handles, &dbptr, + error_if_wal_file_exists))) { return nullptr; } @@ -1049,7 +1696,7 @@ rocksdb_t* rocksdb_open_for_read_only_column_families( column_family_handles[i] = c_handle; } rocksdb_t* result = new rocksdb_t; - result->rep = db; + result->rep = dbptr.release(); return result; } @@ -1065,12 +1712,12 @@ rocksdb_t* rocksdb_open_as_secondary_column_families( std::string(column_family_names[i]), ColumnFamilyOptions(column_family_options[i]->rep)); } - DB* db; + std::unique_ptr dbptr; std::vector handles; - if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep), - std::string(name), - std::string(secondary_path), - column_families, &handles, &db))) { + if (SaveError(errptr, DB::OpenAsSecondary( + DBOptions(db_options->rep), std::string(name), + std::string(secondary_path), column_families, + &handles, &dbptr))) { return nullptr; } for (size_t i = 0; i != handles.size(); ++i) { @@ -1081,7 +1728,7 @@ rocksdb_t* rocksdb_open_as_secondary_column_families( column_family_handles[i] = c_handle; } rocksdb_t* result = new rocksdb_t; - result->rep = db; + result->rep = dbptr.release(); return result; } @@ -1145,6 +1792,26 @@ rocksdb_column_family_handle_t** rocksdb_create_column_families( return c_handles; } +rocksdb_column_family_handle_t* rocksdb_create_column_family_with_import( + rocksdb_t* db, rocksdb_options_t* column_family_options, + const char* column_family_name, + rocksdb_import_column_family_options_t* import_options, + rocksdb_export_import_files_metadata_t* export_import_files_metadata, + char** errptr) { + rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t; + handle->rep = nullptr; + if (SaveError(errptr, + db->rep->CreateColumnFamilyWithImport( + ColumnFamilyOptions(column_family_options->rep), + std::string(column_family_name), import_options->rep, + *(export_import_files_metadata->rep), &(handle->rep)))) { + delete handle; + return nullptr; + } + handle->immortal = false; + return handle; +} + void rocksdb_create_column_families_destroy( rocksdb_column_family_handle_t** list) { free(list); @@ -1348,11 +2015,14 @@ char* rocksdb_get(rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, size_t keylen, size_t* vallen, char** errptr) { char* result = nullptr; - std::string tmp; - Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp); + // Use PinnableSlice to avoid unnecessary copy + PinnableSlice pinnable_val; + Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(), + Slice(key, keylen), &pinnable_val); if (s.ok()) { - *vallen = tmp.size(); - result = CopyString(tmp); + *vallen = pinnable_val.size(); + // Only one copy: from PinnableSlice to malloc'd buffer + result = CopyString(pinnable_val); } else { *vallen = 0; if (!s.IsNotFound()) { @@ -1367,12 +2037,14 @@ char* rocksdb_get_cf(rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, size_t keylen, size_t* vallen, char** errptr) { char* result = nullptr; - std::string tmp; - Status s = - db->rep->Get(options->rep, column_family->rep, Slice(key, keylen), &tmp); + // Use PinnableSlice to avoid unnecessary copy + PinnableSlice pinnable_val; + Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen), + &pinnable_val); if (s.ok()) { - *vallen = tmp.size(); - result = CopyString(tmp); + *vallen = pinnable_val.size(); + // Only one copy: from PinnableSlice to malloc'd buffer + result = CopyString(pinnable_val); } else { *vallen = 0; if (!s.IsNotFound()) { @@ -1445,12 +2117,17 @@ void rocksdb_multi_get(rocksdb_t* db, const rocksdb_readoptions_t* options, size_t num_keys, const char* const* keys_list, const size_t* keys_list_sizes, char** values_list, size_t* values_list_sizes, char** errs) { - std::vector keys(num_keys); + // Use unique_ptr for efficiency (avoids vector overhead for fixed-size array) + std::unique_ptr keys(new Slice[num_keys]); for (size_t i = 0; i < num_keys; i++) { keys[i] = Slice(keys_list[i], keys_list_sizes[i]); } - std::vector values(num_keys); - std::vector statuses = db->rep->MultiGet(options->rep, keys, &values); + // Use PinnableSlice to avoid unnecessary allocations + auto cfh = db->rep->DefaultColumnFamily(); + std::vector values(num_keys); + std::vector statuses(num_keys); + db->rep->MultiGet(options->rep, cfh, num_keys, keys.get(), values.data(), + statuses.data()); for (size_t i = 0; i < num_keys; i++) { if (statuses[i].ok()) { values_list[i] = CopyString(values[i]); @@ -1475,10 +2152,13 @@ void rocksdb_multi_get_with_ts(rocksdb_t* db, char** values_list, size_t* values_list_sizes, char** timestamp_list, size_t* timestamp_list_sizes, char** errs) { - std::vector keys(num_keys); + // Use unique_ptr for efficiency + std::unique_ptr keys_arr(new Slice[num_keys]); for (size_t i = 0; i < num_keys; i++) { - keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + keys_arr[i] = Slice(keys_list[i], keys_list_sizes[i]); } + // Note: MultiGet with timestamps only has vector-based API + std::vector keys(keys_arr.get(), keys_arr.get() + num_keys); std::vector values(num_keys); std::vector timestamps(num_keys); std::vector statuses = @@ -1510,15 +2190,19 @@ void rocksdb_multi_get_cf( size_t num_keys, const char* const* keys_list, const size_t* keys_list_sizes, char** values_list, size_t* values_list_sizes, char** errs) { - std::vector keys(num_keys); - std::vector cfs(num_keys); + // Use unique_ptr for efficiency (avoids vector overhead for fixed-size + // arrays) + std::unique_ptr keys(new Slice[num_keys]); + std::unique_ptr cfs(new ColumnFamilyHandle*[num_keys]); for (size_t i = 0; i < num_keys; i++) { keys[i] = Slice(keys_list[i], keys_list_sizes[i]); cfs[i] = column_families[i]->rep; } - std::vector values(num_keys); - std::vector statuses = - db->rep->MultiGet(options->rep, cfs, keys, &values); + // Use PinnableSlice to avoid unnecessary allocations + std::vector values(num_keys); + std::vector statuses(num_keys); + db->rep->MultiGet(options->rep, num_keys, cfs.get(), keys.get(), + values.data(), statuses.data()); for (size_t i = 0; i < num_keys; i++) { if (statuses[i].ok()) { values_list[i] = CopyString(values[i]); @@ -1543,16 +2227,20 @@ void rocksdb_multi_get_cf_with_ts( const size_t* keys_list_sizes, char** values_list, size_t* values_list_sizes, char** timestamps_list, size_t* timestamps_list_sizes, char** errs) { - std::vector keys(num_keys); - std::vector cfs(num_keys); + // Use unique_ptr for efficiency (avoids vector overhead for fixed-size + // arrays) + std::unique_ptr keys(new Slice[num_keys]); + std::unique_ptr cfs(new ColumnFamilyHandle*[num_keys]); for (size_t i = 0; i < num_keys; i++) { keys[i] = Slice(keys_list[i], keys_list_sizes[i]); cfs[i] = column_families[i]->rep; } - std::vector values(num_keys); + // Use PinnableSlice to avoid unnecessary allocations + std::vector values(num_keys); std::vector timestamps(num_keys); - std::vector statuses = - db->rep->MultiGet(options->rep, cfs, keys, &values, ×tamps); + std::vector statuses(num_keys); + db->rep->MultiGet(options->rep, num_keys, cfs.get(), keys.get(), + values.data(), timestamps.data(), statuses.data()); for (size_t i = 0; i < num_keys; i++) { if (statuses[i].ok()) { values_list[i] = CopyString(values[i]); @@ -1611,6 +2299,41 @@ void rocksdb_batched_multi_get_cf(rocksdb_t* db, delete[] statuses; } +// Batched MultiGet that takes pre-built Slice array, avoiding key conversion +// overhead +void rocksdb_batched_multi_get_cf_slice( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, size_t num_keys, + const rocksdb_slice_t* keys_list, rocksdb_pinnableslice_t** values, + char** errs, const bool sorted_input) { + PinnableSlice* value_slices = new PinnableSlice[num_keys]; + Status* statuses = new Status[num_keys]; + + // Cast rocksdb_slice_t* to Slice* - they have identical memory layout + const Slice* key_slices = reinterpret_cast(keys_list); + + db->rep->MultiGet(options->rep, column_family->rep, num_keys, key_slices, + value_slices, statuses, sorted_input); + + for (size_t i = 0; i < num_keys; ++i) { + if (statuses[i].ok()) { + values[i] = new (rocksdb_pinnableslice_t); + values[i]->rep = std::move(value_slices[i]); + errs[i] = nullptr; + } else { + values[i] = nullptr; + if (!statuses[i].IsNotFound()) { + errs[i] = strdup(statuses[i].ToString().c_str()); + } else { + errs[i] = nullptr; + } + } + } + + delete[] value_slices; + delete[] statuses; +} + unsigned char rocksdb_key_may_exist(rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, size_t key_len, @@ -2031,6 +2754,32 @@ void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) { SaveError(errptr, iter->rep->status()); } +// Iterator functions that return rocksdb_slice_t directly for better +// performance +rocksdb_slice_t rocksdb_iter_key_slice(const rocksdb_iterator_t* iter) { + Slice s = iter->rep->key(); + rocksdb_slice_t result; + result.data = s.data(); + result.size = s.size(); + return result; +} + +rocksdb_slice_t rocksdb_iter_value_slice(const rocksdb_iterator_t* iter) { + Slice s = iter->rep->value(); + rocksdb_slice_t result; + result.data = s.data(); + result.size = s.size(); + return result; +} + +rocksdb_slice_t rocksdb_iter_timestamp_slice(const rocksdb_iterator_t* iter) { + Slice s = iter->rep->timestamp(); + rocksdb_slice_t result; + result.data = s.data(); + result.size = s.size(); + return result; +} + void rocksdb_iter_refresh(const rocksdb_iterator_t* iter, char** errptr) { SaveError(errptr, iter->rep->Refresh()); } @@ -2086,16 +2835,18 @@ void rocksdb_writebatch_putv(rocksdb_writebatch_t* b, int num_keys, const size_t* keys_list_sizes, int num_values, const char* const* values_list, const size_t* values_list_sizes) { - std::vector key_slices(num_keys); + // Use unique_ptr instead of vector to avoid overhead + // Safe because WriteBatch::Put immediately copies the data + std::unique_ptr key_slices(new Slice[num_keys]); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); } - std::vector value_slices(num_values); + std::unique_ptr value_slices(new Slice[num_values]); for (int i = 0; i < num_values; i++) { value_slices[i] = Slice(values_list[i], values_list_sizes[i]); } - b->rep.Put(SliceParts(key_slices.data(), num_keys), - SliceParts(value_slices.data(), num_values)); + b->rep.Put(SliceParts(key_slices.get(), num_keys), + SliceParts(value_slices.get(), num_values)); } void rocksdb_writebatch_putv_cf(rocksdb_writebatch_t* b, @@ -2104,16 +2855,18 @@ void rocksdb_writebatch_putv_cf(rocksdb_writebatch_t* b, const size_t* keys_list_sizes, int num_values, const char* const* values_list, const size_t* values_list_sizes) { - std::vector key_slices(num_keys); + // Use unique_ptr instead of vector to avoid overhead + // Safe because WriteBatch::Put immediately copies the data + std::unique_ptr key_slices(new Slice[num_keys]); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); } - std::vector value_slices(num_values); + std::unique_ptr value_slices(new Slice[num_values]); for (int i = 0; i < num_values; i++) { value_slices[i] = Slice(values_list[i], values_list_sizes[i]); } - b->rep.Put(column_family->rep, SliceParts(key_slices.data(), num_keys), - SliceParts(value_slices.data(), num_values)); + b->rep.Put(column_family->rep, SliceParts(key_slices.get(), num_keys), + SliceParts(value_slices.get(), num_values)); } void rocksdb_writebatch_merge(rocksdb_writebatch_t* b, const char* key, @@ -2133,16 +2886,18 @@ void rocksdb_writebatch_mergev(rocksdb_writebatch_t* b, int num_keys, const size_t* keys_list_sizes, int num_values, const char* const* values_list, const size_t* values_list_sizes) { - std::vector key_slices(num_keys); + // Use unique_ptr instead of vector to avoid overhead + // Safe because WriteBatch::Merge immediately copies the data + std::unique_ptr key_slices(new Slice[num_keys]); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); } - std::vector value_slices(num_values); + std::unique_ptr value_slices(new Slice[num_values]); for (int i = 0; i < num_values; i++) { value_slices[i] = Slice(values_list[i], values_list_sizes[i]); } - b->rep.Merge(SliceParts(key_slices.data(), num_keys), - SliceParts(value_slices.data(), num_values)); + b->rep.Merge(SliceParts(key_slices.get(), num_keys), + SliceParts(value_slices.get(), num_values)); } void rocksdb_writebatch_mergev_cf(rocksdb_writebatch_t* b, @@ -2151,16 +2906,18 @@ void rocksdb_writebatch_mergev_cf(rocksdb_writebatch_t* b, const size_t* keys_list_sizes, int num_values, const char* const* values_list, const size_t* values_list_sizes) { - std::vector key_slices(num_keys); + // Use unique_ptr instead of vector to avoid overhead + // Safe because WriteBatch::Merge immediately copies the data + std::unique_ptr key_slices(new Slice[num_keys]); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); } - std::vector value_slices(num_values); + std::unique_ptr value_slices(new Slice[num_values]); for (int i = 0; i < num_values; i++) { value_slices[i] = Slice(values_list[i], values_list_sizes[i]); } - b->rep.Merge(column_family->rep, SliceParts(key_slices.data(), num_keys), - SliceParts(value_slices.data(), num_values)); + b->rep.Merge(column_family->rep, SliceParts(key_slices.get(), num_keys), + SliceParts(value_slices.get(), num_values)); } void rocksdb_writebatch_delete(rocksdb_writebatch_t* b, const char* key, @@ -2200,21 +2957,25 @@ void rocksdb_writebatch_singledelete_cf_with_ts( void rocksdb_writebatch_deletev(rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) { - std::vector key_slices(num_keys); + // Use unique_ptr instead of vector to avoid overhead + // Safe because WriteBatch::Delete immediately copies the data + std::unique_ptr key_slices(new Slice[num_keys]); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); } - b->rep.Delete(SliceParts(key_slices.data(), num_keys)); + b->rep.Delete(SliceParts(key_slices.get(), num_keys)); } void rocksdb_writebatch_deletev_cf( rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) { - std::vector key_slices(num_keys); + // Use unique_ptr instead of vector to avoid overhead + // Safe because WriteBatch::Delete immediately copies the data + std::unique_ptr key_slices(new Slice[num_keys]); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); } - b->rep.Delete(column_family->rep, SliceParts(key_slices.data(), num_keys)); + b->rep.Delete(column_family->rep, SliceParts(key_slices.get(), num_keys)); } void rocksdb_writebatch_delete_range(rocksdb_writebatch_t* b, @@ -2238,14 +2999,16 @@ void rocksdb_writebatch_delete_rangev(rocksdb_writebatch_t* b, int num_keys, const size_t* start_keys_list_sizes, const char* const* end_keys_list, const size_t* end_keys_list_sizes) { - std::vector start_key_slices(num_keys); - std::vector end_key_slices(num_keys); + // Use unique_ptr instead of vector to avoid overhead + // Safe because WriteBatch::DeleteRange immediately copies the data + std::unique_ptr start_key_slices(new Slice[num_keys]); + std::unique_ptr end_key_slices(new Slice[num_keys]); for (int i = 0; i < num_keys; i++) { start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]); end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]); } - b->rep.DeleteRange(SliceParts(start_key_slices.data(), num_keys), - SliceParts(end_key_slices.data(), num_keys)); + b->rep.DeleteRange(SliceParts(start_key_slices.get(), num_keys), + SliceParts(end_key_slices.get(), num_keys)); } void rocksdb_writebatch_delete_rangev_cf( @@ -2253,15 +3016,17 @@ void rocksdb_writebatch_delete_rangev_cf( int num_keys, const char* const* start_keys_list, const size_t* start_keys_list_sizes, const char* const* end_keys_list, const size_t* end_keys_list_sizes) { - std::vector start_key_slices(num_keys); - std::vector end_key_slices(num_keys); + // Use unique_ptr instead of vector to avoid overhead + // Safe because WriteBatch::DeleteRange immediately copies the data + std::unique_ptr start_key_slices(new Slice[num_keys]); + std::unique_ptr end_key_slices(new Slice[num_keys]); for (int i = 0; i < num_keys; i++) { start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]); end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]); } b->rep.DeleteRange(column_family->rep, - SliceParts(start_key_slices.data(), num_keys), - SliceParts(end_key_slices.data(), num_keys)); + SliceParts(start_key_slices.get(), num_keys), + SliceParts(end_key_slices.get(), num_keys)); } void rocksdb_writebatch_put_log_data(rocksdb_writebatch_t* b, const char* blob, @@ -2274,12 +3039,19 @@ class H : public WriteBatch::Handler { void* state_; void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen); void (*deleted_)(void*, const char* k, size_t klen); + void (*log_data_)(void*, const char* blob, size_t blob_len); + void Put(const Slice& key, const Slice& value) override { (*put_)(state_, key.data(), key.size(), value.data(), value.size()); } void Delete(const Slice& key) override { (*deleted_)(state_, key.data(), key.size()); } + void LogData(const Slice& blob) override { + if (log_data_) { + (*log_data_)(state_, blob.data(), blob.size()); + } + } }; class HCF : public WriteBatch::Handler { @@ -2290,6 +3062,8 @@ class HCF : public WriteBatch::Handler { void (*deleted_cf_)(void*, uint32_t cfid, const char* k, size_t klen); void (*merge_cf_)(void*, uint32_t cfid, const char* k, size_t klen, const char* v, size_t vlen); + void (*log_data_)(void*, const char* blob, size_t blob_len); + Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { (*put_cf_)(state_, column_family_id, key.data(), key.size(), value.data(), @@ -2306,6 +3080,11 @@ class HCF : public WriteBatch::Handler { value.size()); return Status::OK(); } + void LogData(const Slice& blob) override { + if (log_data_) { + (*log_data_)(state_, blob.data(), blob.size()); + } + } }; void rocksdb_writebatch_iterate(rocksdb_writebatch_t* b, void* state, @@ -2317,6 +3096,20 @@ void rocksdb_writebatch_iterate(rocksdb_writebatch_t* b, void* state, handler.state_ = state; handler.put_ = put; handler.deleted_ = deleted; + handler.log_data_ = nullptr; + b->rep.Iterate(&handler); +} + +void rocksdb_writebatch_iterate_ld( + rocksdb_writebatch_t* b, void* state, + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*deleted)(void*, const char* k, size_t klen), + void (*log_data)(void*, const char* blob, size_t blob_len)) { + H handler; + handler.state_ = state; + handler.put_ = put; + handler.deleted_ = deleted; + handler.log_data_ = log_data; b->rep.Iterate(&handler); } @@ -2332,6 +3125,24 @@ void rocksdb_writebatch_iterate_cf( handler.put_cf_ = put_cf; handler.deleted_cf_ = deleted_cf; handler.merge_cf_ = merge_cf; + handler.log_data_ = nullptr; + b->rep.Iterate(&handler); +} + +void rocksdb_writebatch_iterate_cf_ld( + rocksdb_writebatch_t* b, void* state, + void (*put_cf)(void*, uint32_t cfid, const char* k, size_t klen, + const char* v, size_t vlen), + void (*deleted_cf)(void*, uint32_t cfid, const char* k, size_t klen), + void (*merge_cf)(void*, uint32_t cfid, const char* k, size_t klen, + const char* v, size_t vlen), + void (*log_data)(void*, const char* blob, size_t blob_len)) { + HCF handler; + handler.state_ = state; + handler.put_cf_ = put_cf; + handler.deleted_cf_ = deleted_cf; + handler.merge_cf_ = merge_cf; + handler.log_data_ = log_data; b->rep.Iterate(&handler); } @@ -2422,16 +3233,17 @@ void rocksdb_writebatch_wi_putv(rocksdb_writebatch_wi_t* b, int num_keys, const size_t* keys_list_sizes, int num_values, const char* const* values_list, const size_t* values_list_sizes) { - std::vector key_slices(num_keys); + // Use unique_ptr for better performance (avoids vector overhead) + std::unique_ptr key_slices(new Slice[num_keys]); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); } - std::vector value_slices(num_values); + std::unique_ptr value_slices(new Slice[num_values]); for (int i = 0; i < num_values; i++) { value_slices[i] = Slice(values_list[i], values_list_sizes[i]); } - b->rep->Put(SliceParts(key_slices.data(), num_keys), - SliceParts(value_slices.data(), num_values)); + b->rep->Put(SliceParts(key_slices.get(), num_keys), + SliceParts(value_slices.get(), num_values)); } void rocksdb_writebatch_wi_putv_cf( @@ -2467,16 +3279,17 @@ void rocksdb_writebatch_wi_mergev(rocksdb_writebatch_wi_t* b, int num_keys, const size_t* keys_list_sizes, int num_values, const char* const* values_list, const size_t* values_list_sizes) { - std::vector key_slices(num_keys); + // Use unique_ptr for better performance (avoids vector overhead) + std::unique_ptr key_slices(new Slice[num_keys]); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); } - std::vector value_slices(num_values); + std::unique_ptr value_slices(new Slice[num_values]); for (int i = 0; i < num_values; i++) { value_slices[i] = Slice(values_list[i], values_list_sizes[i]); } - b->rep->Merge(SliceParts(key_slices.data(), num_keys), - SliceParts(value_slices.data(), num_values)); + b->rep->Merge(SliceParts(key_slices.get(), num_keys), + SliceParts(value_slices.get(), num_values)); } void rocksdb_writebatch_wi_mergev_cf( @@ -2484,16 +3297,17 @@ void rocksdb_writebatch_wi_mergev_cf( int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, int num_values, const char* const* values_list, const size_t* values_list_sizes) { - std::vector key_slices(num_keys); + // Use unique_ptr for better performance (avoids vector overhead) + std::unique_ptr key_slices(new Slice[num_keys]); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); } - std::vector value_slices(num_values); + std::unique_ptr value_slices(new Slice[num_values]); for (int i = 0; i < num_values; i++) { value_slices[i] = Slice(values_list[i], values_list_sizes[i]); } - b->rep->Merge(column_family->rep, SliceParts(key_slices.data(), num_keys), - SliceParts(value_slices.data(), num_values)); + b->rep->Merge(column_family->rep, SliceParts(key_slices.get(), num_keys), + SliceParts(value_slices.get(), num_values)); } void rocksdb_writebatch_wi_delete(rocksdb_writebatch_wi_t* b, const char* key, @@ -2531,11 +3345,12 @@ void rocksdb_writebatch_wi_deletev(rocksdb_writebatch_wi_t* b, int num_keys, void rocksdb_writebatch_wi_deletev_cf( rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) { - std::vector key_slices(num_keys); + // Use unique_ptr for better performance (avoids vector overhead) + std::unique_ptr key_slices(new Slice[num_keys]); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); } - b->rep->Delete(column_family->rep, SliceParts(key_slices.data(), num_keys)); + b->rep->Delete(column_family->rep, SliceParts(key_slices.get(), num_keys)); } void rocksdb_writebatch_wi_delete_range(rocksdb_writebatch_wi_t* b, @@ -2561,14 +3376,15 @@ void rocksdb_writebatch_wi_delete_rangev(rocksdb_writebatch_wi_t* b, const size_t* start_keys_list_sizes, const char* const* end_keys_list, const size_t* end_keys_list_sizes) { - std::vector start_key_slices(num_keys); - std::vector end_key_slices(num_keys); + // Use unique_ptr for better performance (avoids vector overhead) + std::unique_ptr start_key_slices(new Slice[num_keys]); + std::unique_ptr end_key_slices(new Slice[num_keys]); for (int i = 0; i < num_keys; i++) { start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]); end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]); } - b->rep->DeleteRange(SliceParts(start_key_slices.data(), num_keys), - SliceParts(end_key_slices.data(), num_keys)); + b->rep->DeleteRange(SliceParts(start_key_slices.get(), num_keys), + SliceParts(end_key_slices.get(), num_keys)); } void rocksdb_writebatch_wi_delete_rangev_cf( @@ -2576,15 +3392,16 @@ void rocksdb_writebatch_wi_delete_rangev_cf( int num_keys, const char* const* start_keys_list, const size_t* start_keys_list_sizes, const char* const* end_keys_list, const size_t* end_keys_list_sizes) { - std::vector start_key_slices(num_keys); - std::vector end_key_slices(num_keys); + // Use unique_ptr for better performance (avoids vector overhead) + std::unique_ptr start_key_slices(new Slice[num_keys]); + std::unique_ptr end_key_slices(new Slice[num_keys]); for (int i = 0; i < num_keys; i++) { start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]); end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]); } b->rep->DeleteRange(column_family->rep, - SliceParts(start_key_slices.data(), num_keys), - SliceParts(end_key_slices.data(), num_keys)); + SliceParts(start_key_slices.get(), num_keys), + SliceParts(end_key_slices.get(), num_keys)); } void rocksdb_writebatch_wi_put_log_data(rocksdb_writebatch_wi_t* b, @@ -2627,6 +3444,16 @@ rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base( return result; } +rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_readopts( + rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator, + const rocksdb_readoptions_t* options) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = + wbwi->rep->NewIteratorWithBase(base_iterator->rep, &options->rep); + delete base_iterator; + return result; +} + rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf( rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator, rocksdb_column_family_handle_t* column_family) { @@ -2637,6 +3464,17 @@ rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf( return result; } +rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf_readopts( + rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator, + rocksdb_column_family_handle_t* column_family, + const rocksdb_readoptions_t* options) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = wbwi->rep->NewIteratorWithBase( + column_family->rep, base_iterator->rep, &options->rep); + delete base_iterator; + return result; +} + char* rocksdb_writebatch_wi_get_from_batch(rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options, const char* key, size_t keylen, @@ -2681,12 +3519,13 @@ char* rocksdb_writebatch_wi_get_from_batch_and_db( const rocksdb_readoptions_t* options, const char* key, size_t keylen, size_t* vallen, char** errptr) { char* result = nullptr; - std::string tmp; + // Use PinnableSlice to avoid unnecessary allocations + PinnableSlice pinnable_val; Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, - Slice(key, keylen), &tmp); + Slice(key, keylen), &pinnable_val); if (s.ok()) { - *vallen = tmp.size(); - result = CopyString(tmp); + *vallen = pinnable_val.size(); + result = CopyString(pinnable_val); } else { *vallen = 0; if (!s.IsNotFound()) { @@ -2696,18 +3535,37 @@ char* rocksdb_writebatch_wi_get_from_batch_and_db( return result; } +rocksdb_pinnableslice_t* rocksdb_writebatch_wi_get_pinned_from_batch_and_db( + rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db, + const rocksdb_readoptions_t* options, const char* key, size_t keylen, + char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, + Slice(key, keylen), &v->rep); + if (!s.ok()) { + delete (v); + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + char* rocksdb_writebatch_wi_get_from_batch_and_db_cf( rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db, const rocksdb_readoptions_t* options, rocksdb_column_family_handle_t* column_family, const char* key, size_t keylen, size_t* vallen, char** errptr) { char* result = nullptr; - std::string tmp; - Status s = wbwi->rep->GetFromBatchAndDB( - db->rep, options->rep, column_family->rep, Slice(key, keylen), &tmp); + // Use PinnableSlice to avoid unnecessary allocations + PinnableSlice pinnable_val; + Status s = + wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, column_family->rep, + Slice(key, keylen), &pinnable_val); if (s.ok()) { - *vallen = tmp.size(); - result = CopyString(tmp); + *vallen = pinnable_val.size(); + result = CopyString(pinnable_val); } else { *vallen = 0; if (!s.IsNotFound()) { @@ -2717,6 +3575,24 @@ char* rocksdb_writebatch_wi_get_from_batch_and_db_cf( return result; } +rocksdb_pinnableslice_t* rocksdb_writebatch_wi_get_pinned_from_batch_and_db_cf( + rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr) { + rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t); + Status s = wbwi->rep->GetFromBatchAndDB( + db->rep, options->rep, column_family->rep, Slice(key, keylen), &v->rep); + if (!s.ok()) { + delete (v); + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return v; +} + void rocksdb_write_writebatch_wi(rocksdb_t* db, const rocksdb_writeoptions_t* options, rocksdb_writebatch_wi_t* wbwi, char** errptr) { @@ -2878,6 +3754,12 @@ void rocksdb_block_based_options_set_data_block_index_type( static_cast(v); } +void rocksdb_block_based_options_set_index_block_search_type( + rocksdb_block_based_table_options_t* options, int v) { + options->rep.index_block_search_type = + static_cast(v); +} + void rocksdb_block_based_options_set_data_block_hash_ratio( rocksdb_block_based_table_options_t* options, double v) { options->rep.data_block_hash_table_util_ratio = v; @@ -2924,10 +3806,379 @@ void rocksdb_block_based_options_set_partition_pinning_tier( static_cast(v); } -void rocksdb_block_based_options_set_unpartitioned_pinning_tier( - rocksdb_block_based_table_options_t* options, int v) { - options->rep.metadata_cache_options.unpartitioned_pinning = - static_cast(v); +void rocksdb_block_based_options_set_unpartitioned_pinning_tier( + rocksdb_block_based_table_options_t* options, int v) { + options->rep.metadata_cache_options.unpartitioned_pinning = + static_cast(v); +} + +void rocksdb_block_based_options_set_block_align( + rocksdb_block_based_table_options_t* options, unsigned char v) { + options->rep.block_align = v; +} + +/* FlushJobInfo */ + +const char* rocksdb_flushjobinfo_cf_name(const rocksdb_flushjobinfo_t* info, + size_t* size) { + *size = info->rep.cf_name.size(); + return info->rep.cf_name.data(); +} + +const char* rocksdb_flushjobinfo_file_path(const rocksdb_flushjobinfo_t* info, + size_t* size) { + *size = info->rep.file_path.size(); + return info->rep.file_path.data(); +} + +unsigned char rocksdb_flushjobinfo_triggered_writes_slowdown( + const rocksdb_flushjobinfo_t* info) { + return info->rep.triggered_writes_slowdown; +} + +unsigned char rocksdb_flushjobinfo_triggered_writes_stop( + const rocksdb_flushjobinfo_t* info) { + return info->rep.triggered_writes_stop; +} + +uint64_t rocksdb_flushjobinfo_largest_seqno( + const rocksdb_flushjobinfo_t* info) { + return info->rep.largest_seqno; +} + +uint64_t rocksdb_flushjobinfo_smallest_seqno( + const rocksdb_flushjobinfo_t* info) { + return info->rep.smallest_seqno; +} + +uint32_t rocksdb_flushjobinfo_flush_reason(const rocksdb_flushjobinfo_t* info) { + return static_cast(info->rep.flush_reason); +} + +void rocksdb_reset_status(rocksdb_status_ptr_t* status_ptr) { + auto ptr = status_ptr->rep; + *ptr = Status::OK(); +} + +/* CompactionJobInfo */ + +void rocksdb_compactionjobinfo_status(const rocksdb_compactionjobinfo_t* info, + char** errptr) { + SaveError(errptr, info->rep.status); +} + +const char* rocksdb_compactionjobinfo_cf_name( + const rocksdb_compactionjobinfo_t* info, size_t* size) { + *size = info->rep.cf_name.size(); + return info->rep.cf_name.data(); +} + +size_t rocksdb_compactionjobinfo_input_files_count( + const rocksdb_compactionjobinfo_t* info) { + return info->rep.input_files.size(); +} + +const char* rocksdb_compactionjobinfo_input_file_at( + const rocksdb_compactionjobinfo_t* info, size_t pos, size_t* size) { + assert(info != nullptr); + assert(pos < info->rep.input_files.size()); + + const std::string& path = info->rep.input_files[pos]; + *size = path.size(); + return path.data(); +} + +size_t rocksdb_compactionjobinfo_output_files_count( + const rocksdb_compactionjobinfo_t* info) { + return info->rep.output_files.size(); +} + +const char* rocksdb_compactionjobinfo_output_file_at( + const rocksdb_compactionjobinfo_t* info, size_t pos, size_t* size) { + assert(info != nullptr); + assert(pos < info->rep.output_files.size()); + + const std::string& path = info->rep.output_files[pos]; + *size = path.size(); + return path.data(); +} + +uint64_t rocksdb_compactionjobinfo_elapsed_micros( + const rocksdb_compactionjobinfo_t* info) { + return info->rep.stats.elapsed_micros; +} + +uint64_t rocksdb_compactionjobinfo_num_corrupt_keys( + const rocksdb_compactionjobinfo_t* info) { + return info->rep.stats.num_corrupt_keys; +} + +int rocksdb_compactionjobinfo_base_input_level( + const rocksdb_compactionjobinfo_t* info) { + return info->rep.base_input_level; +} + +int rocksdb_compactionjobinfo_output_level( + const rocksdb_compactionjobinfo_t* info) { + return info->rep.output_level; +} + +size_t rocksdb_compactionjobinfo_num_input_files( + const rocksdb_compactionjobinfo_t* info) { + return info->rep.stats.num_input_files; +} + +size_t rocksdb_compactionjobinfo_num_input_files_at_output_level( + const rocksdb_compactionjobinfo_t* info) { + return info->rep.stats.num_input_files_at_output_level; +} + +uint64_t rocksdb_compactionjobinfo_input_records( + const rocksdb_compactionjobinfo_t* info) { + return info->rep.stats.num_input_records; +} + +uint64_t rocksdb_compactionjobinfo_output_records( + const rocksdb_compactionjobinfo_t* info) { + return info->rep.stats.num_output_records; +} + +uint64_t rocksdb_compactionjobinfo_total_input_bytes( + const rocksdb_compactionjobinfo_t* info) { + return info->rep.stats.total_input_bytes; +} + +uint64_t rocksdb_compactionjobinfo_total_output_bytes( + const rocksdb_compactionjobinfo_t* info) { + return info->rep.stats.total_output_bytes; +} + +uint32_t rocksdb_compactionjobinfo_compaction_reason( + const rocksdb_compactionjobinfo_t* info) { + return static_cast(info->rep.compaction_reason); +} + +/* SubcompactionJobInfo */ + +void rocksdb_subcompactionjobinfo_status( + const rocksdb_subcompactionjobinfo_t* info, char** errptr) { + SaveError(errptr, info->rep.status); +} + +const char* rocksdb_subcompactionjobinfo_cf_name( + const rocksdb_subcompactionjobinfo_t* info, size_t* size) { + *size = info->rep.cf_name.size(); + return info->rep.cf_name.data(); +} + +uint64_t rocksdb_subcompactionjobinfo_thread_id( + const rocksdb_subcompactionjobinfo_t* info) { + return info->rep.thread_id; +} + +int rocksdb_subcompactionjobinfo_base_input_level( + const rocksdb_subcompactionjobinfo_t* info) { + return info->rep.base_input_level; +} + +int rocksdb_subcompactionjobinfo_output_level( + const rocksdb_subcompactionjobinfo_t* info) { + return info->rep.output_level; +} + +uint32_t rocksdb_subcompactionjobinfo_compaction_reason( + const rocksdb_subcompactionjobinfo_t* info) { + return static_cast(info->rep.compaction_reason); +} + +/* ExternalFileIngestionInfo */ + +const char* rocksdb_externalfileingestioninfo_cf_name( + const rocksdb_externalfileingestioninfo_t* info, size_t* size) { + *size = info->rep.cf_name.size(); + return info->rep.cf_name.data(); +} + +const char* rocksdb_externalfileingestioninfo_internal_file_path( + const rocksdb_externalfileingestioninfo_t* info, size_t* size) { + *size = info->rep.internal_file_path.size(); + return info->rep.internal_file_path.data(); +} + +/* External write stall info */ +extern ROCKSDB_LIBRARY_API const char* rocksdb_writestallinfo_cf_name( + const rocksdb_writestallinfo_t* info, size_t* size) { + *size = info->rep.cf_name.size(); + return info->rep.cf_name.data(); +} + +const rocksdb_writestallcondition_t* rocksdb_writestallinfo_cur( + const rocksdb_writestallinfo_t* info) { + return reinterpret_cast( + &info->rep.condition.cur); +} + +const rocksdb_writestallcondition_t* rocksdb_writestallinfo_prev( + const rocksdb_writestallinfo_t* info) { + return reinterpret_cast( + &info->rep.condition.prev); +} + +const char* rocksdb_memtableinfo_cf_name(const rocksdb_memtableinfo_t* info, + size_t* size) { + *size = info->rep.cf_name.size(); + return info->rep.cf_name.data(); +} + +uint64_t rocksdb_memtableinfo_first_seqno(const rocksdb_memtableinfo_t* info) { + return info->rep.first_seqno; +} +uint64_t rocksdb_memtableinfo_earliest_seqno( + const rocksdb_memtableinfo_t* info) { + return info->rep.earliest_seqno; +} +uint64_t rocksdb_memtableinfo_num_entries(const rocksdb_memtableinfo_t* info) { + return info->rep.num_entries; +} +uint64_t rocksdb_memtableinfo_num_deletes(const rocksdb_memtableinfo_t* info) { + return info->rep.num_deletes; +} + +/* event listener */ + +struct rocksdb_eventlistener_t : public EventListener { + void* state_{}; + void (*destructor_)(void*){}; + void (*on_flush_begin)(void*, rocksdb_t*, const rocksdb_flushjobinfo_t*){}; + void (*on_flush_completed)(void*, rocksdb_t*, + const rocksdb_flushjobinfo_t*){}; + void (*on_compaction_begin)(void*, rocksdb_t*, + const rocksdb_compactionjobinfo_t*){}; + void (*on_compaction_completed)(void*, rocksdb_t*, + const rocksdb_compactionjobinfo_t*){}; + void (*on_subcompaction_begin)(void*, + const rocksdb_subcompactionjobinfo_t*){}; + void (*on_subcompaction_completed)(void*, + const rocksdb_subcompactionjobinfo_t*){}; + void (*on_external_file_ingested)( + void*, rocksdb_t*, const rocksdb_externalfileingestioninfo_t*){}; + void (*on_background_error)(void*, uint32_t, rocksdb_status_ptr_t*){}; + void (*on_stall_conditions_changed)(void*, const rocksdb_writestallinfo_t*){}; + void (*on_memtable_sealed)(void*, const rocksdb_memtableinfo_t*){}; + + rocksdb_eventlistener_t() = default; + + rocksdb_eventlistener_t(const rocksdb_eventlistener_t&) = delete; + rocksdb_eventlistener_t& operator=(const rocksdb_eventlistener_t&) = delete; + rocksdb_eventlistener_t(rocksdb_eventlistener_t&&) = delete; + rocksdb_eventlistener_t& operator=(rocksdb_eventlistener_t&&) = delete; + + void OnFlushBegin(DB* db, const FlushJobInfo& info) override { + rocksdb_t c_db = {db}; + on_flush_begin(state_, &c_db, + reinterpret_cast(&info)); + } + + void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { + rocksdb_t c_db = {db}; + on_flush_completed(state_, &c_db, + reinterpret_cast(&info)); + } + + void OnCompactionBegin(DB* db, const CompactionJobInfo& info) override { + rocksdb_t c_db = {db}; + on_compaction_begin( + state_, &c_db, + reinterpret_cast(&info)); + } + + void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override { + rocksdb_t c_db = {db}; + on_compaction_completed( + state_, &c_db, + reinterpret_cast(&info)); + } + + void OnSubcompactionBegin(const SubcompactionJobInfo& info) override { + on_subcompaction_begin( + state_, reinterpret_cast(&info)); + } + + void OnSubcompactionCompleted(const SubcompactionJobInfo& info) override { + on_subcompaction_completed( + state_, reinterpret_cast(&info)); + } + + void OnExternalFileIngested(DB* db, + const ExternalFileIngestionInfo& info) override { + rocksdb_t c_db = {db}; + on_external_file_ingested( + state_, &c_db, + reinterpret_cast(&info)); + } + + void OnBackgroundError(BackgroundErrorReason reason, + Status* status) override { + rocksdb_status_ptr_t* s = new rocksdb_status_ptr_t; + s->rep = status; + on_background_error(state_, static_cast(reason), s); + delete s; + } + + void OnStallConditionsChanged(const WriteStallInfo& info) override { + on_stall_conditions_changed( + state_, reinterpret_cast(&info)); + } + + void OnMemTableSealed(const MemTableInfo& info) override { + on_memtable_sealed(state_, + reinterpret_cast(&info)); + } + + ~rocksdb_eventlistener_t() override { destructor_(state_); } +}; + +rocksdb_eventlistener_t* rocksdb_eventlistener_create( + void* state_, void (*destructor_)(void*), on_flush_begin_cb on_flush_begin, + on_flush_completed_cb on_flush_completed, + on_compaction_begin_cb on_compaction_begin, + on_compaction_completed_cb on_compaction_completed, + on_subcompaction_begin_cb on_subcompaction_begin, + on_subcompaction_completed_cb on_subcompaction_completed, + on_external_file_ingested_cb on_external_file_ingested, + on_background_error_cb on_background_error, + on_stall_conditions_changed_cb on_stall_conditions_changed, + on_memtable_sealed_cb on_memtable_sealed) { + rocksdb_eventlistener_t* et = new rocksdb_eventlistener_t; + et->state_ = state_; + et->destructor_ = destructor_; + et->on_flush_begin = on_flush_begin; + et->on_flush_completed = on_flush_completed; + et->on_compaction_begin = on_compaction_begin; + et->on_compaction_completed = on_compaction_completed; + et->on_subcompaction_begin = on_subcompaction_begin; + et->on_subcompaction_completed = on_subcompaction_completed; + et->on_external_file_ingested = on_external_file_ingested; + et->on_background_error = on_background_error; + et->on_stall_conditions_changed = on_stall_conditions_changed; + et->on_memtable_sealed = on_memtable_sealed; + return et; +} + +void rocksdb_eventlistener_destroy(rocksdb_eventlistener_t* t) { delete t; } + +void rocksdb_options_add_eventlistener(rocksdb_options_t* opt, + rocksdb_eventlistener_t* t) { + opt->rep.listeners.emplace_back(std::shared_ptr(t)); +} + +void rocksdb_compaction_service_options_override_add_event_listener( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_eventlistener_t* event_listener) { + if (override_options && event_listener) { + override_options->rep.listeners.emplace_back( + std::shared_ptr(event_listener)); + } } rocksdb_cuckoo_table_options_t* rocksdb_cuckoo_options_create() { @@ -3141,6 +4392,65 @@ rocksdb_logger_t* rocksdb_logger_create_callback_logger( void rocksdb_logger_destroy(rocksdb_logger_t* logger) { delete logger; } +/* File Checksum Gen Factory */ + +rocksdb_file_checksum_gen_factory_t* +rocksdb_file_checksum_gen_crc32c_factory_create() { + rocksdb_file_checksum_gen_factory_t* factory = + new rocksdb_file_checksum_gen_factory_t; + factory->rep = GetFileChecksumGenCrc32cFactory(); + return factory; +} + +void rocksdb_file_checksum_gen_factory_destroy( + rocksdb_file_checksum_gen_factory_t* factory) { + delete factory; +} + +void rocksdb_options_set_file_checksum_gen_factory( + rocksdb_options_t* opt, rocksdb_file_checksum_gen_factory_t* factory) { + if (opt && factory) { + opt->rep.file_checksum_gen_factory = factory->rep; + } +} + +/* SST Partitioner Factory */ + +rocksdb_sst_partitioner_factory_t* +rocksdb_sst_partitioner_fixed_prefix_factory_create(size_t prefix_len) { + rocksdb_sst_partitioner_factory_t* factory = + new rocksdb_sst_partitioner_factory_t; + factory->rep = NewSstPartitionerFixedPrefixFactory(prefix_len); + return factory; +} + +void rocksdb_sst_partitioner_factory_destroy( + rocksdb_sst_partitioner_factory_t* factory) { + delete factory; +} + +void rocksdb_options_set_sst_partitioner_factory( + rocksdb_options_t* opt, rocksdb_sst_partitioner_factory_t* factory) { + if (opt && factory) { + opt->rep.sst_partitioner_factory = factory->rep; + } +} + +/* Table Properties Collector Factory */ + +void rocksdb_table_properties_collector_factory_destroy( + rocksdb_table_properties_collector_factory_t* factory) { + delete factory; +} + +void rocksdb_options_add_table_properties_collector_factory( + rocksdb_options_t* opt, + rocksdb_table_properties_collector_factory_t* factory) { + if (opt && factory) { + opt->rep.table_properties_collector_factories.push_back(factory->rep); + } +} + void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) { opt->rep.env = (env ? env->rep : nullptr); } @@ -3183,6 +4493,11 @@ void rocksdb_options_set_write_buffer_manager( opt->rep.write_buffer_manager = wbm->rep; } +void rocksdb_options_set_sst_file_manager(rocksdb_options_t* opt, + rocksdb_sst_file_manager_t* sfm) { + opt->rep.sst_file_manager = sfm->rep; +} + size_t rocksdb_options_get_write_buffer_size(rocksdb_options_t* opt) { return opt->rep.write_buffer_size; } @@ -3295,6 +4610,26 @@ uint64_t rocksdb_options_get_periodic_compaction_seconds( return opt->rep.periodic_compaction_seconds; } +void rocksdb_options_set_memtable_op_scan_flush_trigger(rocksdb_options_t* opt, + uint32_t n) { + opt->rep.memtable_op_scan_flush_trigger = n; +} + +uint32_t rocksdb_options_get_memtable_op_scan_flush_trigger( + rocksdb_options_t* opt) { + return opt->rep.memtable_op_scan_flush_trigger; +} + +void rocksdb_options_set_memtable_avg_op_scan_flush_trigger( + rocksdb_options_t* opt, uint32_t n) { + opt->rep.memtable_avg_op_scan_flush_trigger = n; +} + +uint32_t rocksdb_options_get_memtable_avg_op_scan_flush_trigger( + rocksdb_options_t* opt) { + return opt->rep.memtable_avg_op_scan_flush_trigger; +} + void rocksdb_options_enable_statistics(rocksdb_options_t* opt) { opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); } @@ -3332,16 +4667,6 @@ unsigned char rocksdb_options_get_skip_stats_update_on_db_open( return opt->rep.skip_stats_update_on_db_open; } -void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open( - rocksdb_options_t* opt, unsigned char val) { - opt->rep.skip_checking_sst_file_sizes_on_db_open = val; -} - -unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( - rocksdb_options_t* opt) { - return opt->rep.skip_checking_sst_file_sizes_on_db_open; -} - /* Blob Options Settings */ void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt, unsigned char val) { @@ -3804,16 +5129,6 @@ int rocksdb_options_get_min_write_buffer_number_to_merge( return opt->rep.min_write_buffer_number_to_merge; } -void rocksdb_options_set_max_write_buffer_number_to_maintain( - rocksdb_options_t* opt, int n) { - opt->rep.max_write_buffer_number_to_maintain = n; -} - -int rocksdb_options_get_max_write_buffer_number_to_maintain( - rocksdb_options_t* opt) { - return opt->rep.max_write_buffer_number_to_maintain; -} - void rocksdb_options_set_max_write_buffer_size_to_maintain( rocksdb_options_t* opt, int64_t n) { opt->rep.max_write_buffer_size_to_maintain = n; @@ -4280,6 +5595,15 @@ void rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio( opt->rep.table_properties_collector_factories.emplace_back(compact_on_del); } +void rocksdb_options_add_compact_on_deletion_collector_factory_min_file_size( + rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger, + double deletion_ratio, uint64_t min_file_size) { + std::shared_ptr + compact_on_del = NewCompactOnDeletionCollectorFactory( + window_size, num_dels_trigger, deletion_ratio, min_file_size); + opt->rep.table_properties_collector_factories.emplace_back(compact_on_del); +} + void rocksdb_set_perf_level(int v) { PerfLevel level = static_cast(v); SetPerfLevel(level); @@ -4332,6 +5656,8 @@ uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context, return rep->internal_recent_skipped_count; case rocksdb_internal_merge_count: return rep->internal_merge_count; + case rocksdb_internal_merge_point_lookup_count: + return rep->internal_merge_point_lookup_count; case rocksdb_get_snapshot_time: return rep->get_snapshot_time; case rocksdb_get_from_memtable_time: @@ -4756,11 +6082,6 @@ unsigned char rocksdb_readoptions_get_tailing(rocksdb_readoptions_t* opt) { return opt->rep.tailing; } -void rocksdb_readoptions_set_managed(rocksdb_readoptions_t* opt, - unsigned char v) { - opt->rep.managed = v; -} - void rocksdb_readoptions_set_readahead_size(rocksdb_readoptions_t* opt, size_t v) { opt->rep.readahead_size = v; @@ -5239,6 +6560,67 @@ ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_allow_stall( wbm->rep->SetAllowStall(new_allow_stall); } +rocksdb_sst_file_manager_t* rocksdb_sst_file_manager_create( + rocksdb_env_t* env) { + rocksdb_sst_file_manager_t* sfm = new rocksdb_sst_file_manager_t; + sfm->rep.reset(ROCKSDB_NAMESPACE::NewSstFileManager(env->rep)); + return sfm; +} + +void rocksdb_sst_file_manager_destroy(rocksdb_sst_file_manager_t* sfm) { + delete sfm; +} + +void rocksdb_sst_file_manager_set_max_allowed_space_usage( + rocksdb_sst_file_manager_t* sfm, uint64_t max_allowed_space) { + sfm->rep->SetMaxAllowedSpaceUsage(max_allowed_space); +} + +void rocksdb_sst_file_manager_set_compaction_buffer_size( + rocksdb_sst_file_manager_t* sfm, uint64_t compaction_buffer_size) { + sfm->rep->SetCompactionBufferSize(compaction_buffer_size); +} + +bool rocksdb_sst_file_manager_is_max_allowed_space_reached( + rocksdb_sst_file_manager_t* sfm) { + return sfm->rep->IsMaxAllowedSpaceReached(); +} + +bool rocksdb_sst_file_manager_is_max_allowed_space_reached_including_compactions( + rocksdb_sst_file_manager_t* sfm) { + return sfm->rep->IsMaxAllowedSpaceReachedIncludingCompactions(); +} + +uint64_t rocksdb_sst_file_manager_get_total_size( + rocksdb_sst_file_manager_t* sfm) { + return sfm->rep->GetTotalSize(); +} + +int64_t rocksdb_sst_file_manager_get_delete_rate_bytes_per_second( + rocksdb_sst_file_manager_t* sfm) { + return sfm->rep->GetDeleteRateBytesPerSecond(); +} + +void rocksdb_sst_file_manager_set_delete_rate_bytes_per_second( + rocksdb_sst_file_manager_t* sfm, int64_t delete_rate) { + return sfm->rep->SetDeleteRateBytesPerSecond(delete_rate); +} + +double rocksdb_sst_file_manager_get_max_trash_db_ratio( + rocksdb_sst_file_manager_t* sfm) { + return sfm->rep->GetMaxTrashDBRatio(); +} + +void rocksdb_sst_file_manager_set_max_trash_db_ratio( + rocksdb_sst_file_manager_t* sfm, double ratio) { + return sfm->rep->SetMaxTrashDBRatio(ratio); +} + +uint64_t rocksdb_sst_file_manager_get_total_trash_size( + rocksdb_sst_file_manager_t* sfm) { + return sfm->rep->GetTotalTrashSize(); +} + rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, uint64_t target_size) { rocksdb_dbpath_t* result = new rocksdb_dbpath_t; @@ -5500,14 +6882,12 @@ rocksdb_slicetransform_t* rocksdb_slicetransform_create( char* (*transform)(void*, const char* key, size_t length, size_t* dst_length), unsigned char (*in_domain)(void*, const char* key, size_t length), - unsigned char (*in_range)(void*, const char* key, size_t length), const char* (*name)(void*)) { rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t; result->state_ = state; result->destructor_ = destructor; result->transform_ = transform; result->in_domain_ = in_domain; - result->in_range_ = in_range; result->name_ = name; return result; } @@ -5523,7 +6903,6 @@ struct SliceTransformWrapper : public rocksdb_slicetransform_t { return rep_->Transform(src); } bool InDomain(const Slice& src) const override { return rep_->InDomain(src); } - bool InRange(const Slice& src) const override { return rep_->InRange(src); } static void DoNothing(void*) {} }; @@ -5647,6 +7026,27 @@ uint64_t rocksdb_fifo_compaction_options_get_max_table_files_size( return fifo_opts->rep.max_table_files_size; } +void rocksdb_fifo_compaction_options_set_max_data_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) { + fifo_opts->rep.max_data_files_size = size; +} + +uint64_t rocksdb_fifo_compaction_options_get_max_data_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts) { + return fifo_opts->rep.max_data_files_size; +} + +void rocksdb_fifo_compaction_options_set_use_kv_ratio_compaction( + rocksdb_fifo_compaction_options_t* fifo_opts, + unsigned char use_kv_ratio_compaction) { + fifo_opts->rep.use_kv_ratio_compaction = use_kv_ratio_compaction; +} + +unsigned char rocksdb_fifo_compaction_options_get_use_kv_ratio_compaction( + rocksdb_fifo_compaction_options_t* fifo_opts) { + return fifo_opts->rep.use_kv_ratio_compaction; +} + void rocksdb_fifo_compaction_options_destroy( rocksdb_fifo_compaction_options_t* fifo_opts) { delete fifo_opts; @@ -5666,6 +7066,10 @@ void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, } } +rocksdb_livefiles_t* rocksdb_livefiles_create() { + return new rocksdb_livefiles_t; +} + int rocksdb_livefiles_count(const rocksdb_livefiles_t* lf) { return static_cast(lf->rep.size()); } @@ -5679,6 +7083,16 @@ const char* rocksdb_livefiles_name(const rocksdb_livefiles_t* lf, int index) { return lf->rep[index].name.c_str(); } +const char* rocksdb_livefiles_directory(const rocksdb_livefiles_t* lf, + int index) { + if (lf->rep[index].directory.empty()) { + // db_path is deprecated but still returned by some code paths + return lf->rep[index].db_path.c_str(); + } else { + return lf->rep[index].directory.c_str(); + } +} + int rocksdb_livefiles_level(const rocksdb_livefiles_t* lf, int index) { return lf->rep[index].level; } @@ -5699,6 +7113,16 @@ const char* rocksdb_livefiles_largestkey(const rocksdb_livefiles_t* lf, return lf->rep[index].largestkey.data(); } +uint64_t rocksdb_livefiles_smallest_seqno(const rocksdb_livefiles_t* lf, + int index) { + return lf->rep[index].smallest_seqno; +} + +uint64_t rocksdb_livefiles_largest_seqno(const rocksdb_livefiles_t* lf, + int index) { + return lf->rep[index].largest_seqno; +} + uint64_t rocksdb_livefiles_entries(const rocksdb_livefiles_t* lf, int index) { return lf->rep[index].num_entries; } @@ -5709,6 +7133,71 @@ uint64_t rocksdb_livefiles_deletions(const rocksdb_livefiles_t* lf, int index) { void rocksdb_livefiles_destroy(const rocksdb_livefiles_t* lf) { delete lf; } +rocksdb_livefile_t* rocksdb_livefile_create() { return new rocksdb_livefile_t; } + +void rocksdb_livefile_set_column_family_name(rocksdb_livefile_t* lf, + const char* column_family_name) { + lf->rep.column_family_name = std::string(column_family_name); +} + +void rocksdb_livefile_set_level(rocksdb_livefile_t* lf, int level) { + lf->rep.level = level; +} + +void rocksdb_livefile_set_name(rocksdb_livefile_t* lf, const char* name) { + lf->rep.name = std::string(name); +} + +void rocksdb_livefile_set_directory(rocksdb_livefile_t* lf, + const char* directory) { + lf->rep.directory = std::string(directory); + lf->rep.db_path = std::string(directory); // deprecated but still needed +} + +void rocksdb_livefile_set_size(rocksdb_livefile_t* lf, size_t size) { + lf->rep.size = size; +} + +void rocksdb_livefile_set_smallest_key(rocksdb_livefile_t* lf, + const char* smallest_key, + size_t smallest_key_len) { + lf->rep.smallestkey = std::string(smallest_key, smallest_key_len); +} + +void rocksdb_livefile_set_largest_key(rocksdb_livefile_t* lf, + const char* largest_key, + size_t largest_key_len) { + lf->rep.largestkey = std::string(largest_key, largest_key_len); +} + +void rocksdb_livefile_set_smallest_seqno(rocksdb_livefile_t* lf, + uint64_t smallest_seqno) { + lf->rep.smallest_seqno = smallest_seqno; +} + +void rocksdb_livefile_set_largest_seqno(rocksdb_livefile_t* lf, + uint64_t largest_seqno) { + lf->rep.largest_seqno = largest_seqno; +} + +void rocksdb_livefile_set_num_entries(rocksdb_livefile_t* lf, + uint64_t num_entries) { + lf->rep.num_entries = num_entries; +} + +void rocksdb_livefile_set_num_deletions(rocksdb_livefile_t* lf, + uint64_t num_deletions) { + lf->rep.num_deletions = num_deletions; +} + +void rocksdb_livefile_destroy(rocksdb_livefile_t* lf) { delete lf; } + +void rocksdb_livefiles_add(rocksdb_livefiles_t* lf, + rocksdb_livefile_t* livefile) { + lf->rep.push_back(std::move(livefile->rep)); + delete livefile; +} + void rocksdb_get_options_from_string(const rocksdb_options_t* base_options, const char* opts_str, rocksdb_options_t* new_options, @@ -5859,6 +7348,58 @@ char* rocksdb_sst_file_metadata_get_largestkey( return CopyString(file_meta->rep->largestkey); } +rocksdb_import_column_family_options_t* +rocksdb_import_column_family_options_create() { + return new rocksdb_import_column_family_options_t; +} + +void rocksdb_import_column_family_options_set_move_files( + rocksdb_import_column_family_options_t* opt, unsigned char v) { + opt->rep.move_files = v; +} + +void rocksdb_import_column_family_options_destroy( + rocksdb_import_column_family_options_t* metadata) { + delete metadata; +} + +rocksdb_export_import_files_metadata_t* +rocksdb_export_import_files_metadata_create() { + auto metadata = new rocksdb_export_import_files_metadata_t; + metadata->rep = new ExportImportFilesMetaData; + return metadata; +} + +char* rocksdb_export_import_files_metadata_get_db_comparator_name( + rocksdb_export_import_files_metadata_t* metadata) { + return strdup(metadata->rep->db_comparator_name.c_str()); +} + +void rocksdb_export_import_files_metadata_set_db_comparator_name( + rocksdb_export_import_files_metadata_t* metadata, const char* name) { + metadata->rep->db_comparator_name = std::string(name); +} + +rocksdb_livefiles_t* rocksdb_export_import_files_metadata_get_files( + rocksdb_export_import_files_metadata_t* export_import_metadata) { + auto files = new rocksdb_livefiles_t; + files->rep = std::vector(export_import_metadata->rep->files); + return files; +} + +void rocksdb_export_import_files_metadata_set_files( + rocksdb_export_import_files_metadata_t* metadata, + rocksdb_livefiles_t* files) { + metadata->rep->files = std::move(files->rep); + delete files; +} + +void rocksdb_export_import_files_metadata_destroy( + rocksdb_export_import_files_metadata_t* metadata) { + delete metadata->rep; + delete metadata; +} + /* Transactions */ rocksdb_transactiondb_options_t* rocksdb_transactiondb_options_create() { @@ -5890,6 +7431,11 @@ void rocksdb_transactiondb_options_set_default_lock_timeout( opt->rep.default_lock_timeout = default_lock_timeout; } +void rocksdb_transactiondb_options_set_use_per_key_point_lock_mgr( + rocksdb_transactiondb_options_t* opt, int use_per_key_point_lock_mgr) { + opt->rep.use_per_key_point_lock_mgr = use_per_key_point_lock_mgr; +} + rocksdb_transaction_options_t* rocksdb_transaction_options_create() { return new rocksdb_transaction_options_t; } @@ -6186,11 +7732,11 @@ char* rocksdb_transaction_get(rocksdb_transaction_t* txn, const char* key, size_t klen, size_t* vlen, char** errptr) { char* result = nullptr; - std::string tmp; - Status s = txn->rep->Get(options->rep, Slice(key, klen), &tmp); + PinnableSlice pinnable_val; + Status s = txn->rep->Get(options->rep, Slice(key, klen), &pinnable_val); if (s.ok()) { - *vlen = tmp.size(); - result = CopyString(tmp); + *vlen = pinnable_val.size(); + result = CopyString(pinnable_val); } else { *vlen = 0; if (!s.IsNotFound()) { @@ -6221,12 +7767,12 @@ char* rocksdb_transaction_get_cf(rocksdb_transaction_t* txn, const char* key, size_t klen, size_t* vlen, char** errptr) { char* result = nullptr; - std::string tmp; - Status s = - txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), &tmp); + PinnableSlice pinnable_val; + Status s = txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), + &pinnable_val); if (s.ok()) { - *vlen = tmp.size(); - result = CopyString(tmp); + *vlen = pinnable_val.size(); + result = CopyString(pinnable_val); } else { *vlen = 0; if (!s.IsNotFound()) { @@ -6260,12 +7806,12 @@ char* rocksdb_transaction_get_for_update(rocksdb_transaction_t* txn, size_t* vlen, unsigned char exclusive, char** errptr) { char* result = nullptr; - std::string tmp; - Status s = - txn->rep->GetForUpdate(options->rep, Slice(key, klen), &tmp, exclusive); + PinnableSlice pinnable_val; + Status s = txn->rep->GetForUpdate(options->rep, Slice(key, klen), + &pinnable_val, exclusive); if (s.ok()) { - *vlen = tmp.size(); - result = CopyString(tmp); + *vlen = pinnable_val.size(); + result = CopyString(pinnable_val); } else { *vlen = 0; if (!s.IsNotFound()) { @@ -6297,12 +7843,12 @@ char* rocksdb_transaction_get_for_update_cf( rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, size_t* vlen, unsigned char exclusive, char** errptr) { char* result = nullptr; - std::string tmp; + PinnableSlice pinnable_val; Status s = txn->rep->GetForUpdate(options->rep, column_family->rep, - Slice(key, klen), &tmp, exclusive); + Slice(key, klen), &pinnable_val, exclusive); if (s.ok()) { - *vlen = tmp.size(); - result = CopyString(tmp); + *vlen = pinnable_val.size(); + result = CopyString(pinnable_val); } else { *vlen = 0; if (!s.IsNotFound()) { @@ -6336,10 +7882,13 @@ void rocksdb_transaction_multi_get(rocksdb_transaction_t* txn, const size_t* keys_list_sizes, char** values_list, size_t* values_list_sizes, char** errs) { - std::vector keys(num_keys); + // Use unique_ptr for efficiency + std::unique_ptr keys_arr(new Slice[num_keys]); for (size_t i = 0; i < num_keys; i++) { - keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + keys_arr[i] = Slice(keys_list[i], keys_list_sizes[i]); } + // Note: Transaction only has vector-based MultiGet API + std::vector keys(keys_arr.get(), keys_arr.get() + num_keys); std::vector values(num_keys); std::vector statuses = txn->rep->MultiGet(options->rep, keys, &values); @@ -6365,10 +7914,14 @@ void rocksdb_transaction_multi_get_for_update( size_t num_keys, const char* const* keys_list, const size_t* keys_list_sizes, char** values_list, size_t* values_list_sizes, char** errs) { - std::vector keys(num_keys); + // Use unique_ptr for efficiency + std::unique_ptr keys_arr(new Slice[num_keys]); for (size_t i = 0; i < num_keys; i++) { - keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + keys_arr[i] = Slice(keys_list[i], keys_list_sizes[i]); } + // Note: GetForUpdate only has vector-based API, no array-based PinnableSlice + // variant + std::vector keys(keys_arr.get(), keys_arr.get() + num_keys); std::vector values(num_keys); std::vector statuses = txn->rep->MultiGetForUpdate(options->rep, keys, &values); @@ -6395,12 +7948,15 @@ void rocksdb_transaction_multi_get_cf( size_t num_keys, const char* const* keys_list, const size_t* keys_list_sizes, char** values_list, size_t* values_list_sizes, char** errs) { - std::vector keys(num_keys); + // Use unique_ptr for efficiency + std::unique_ptr keys_arr(new Slice[num_keys]); std::vector cfs(num_keys); for (size_t i = 0; i < num_keys; i++) { - keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + keys_arr[i] = Slice(keys_list[i], keys_list_sizes[i]); cfs[i] = column_families[i]->rep; } + // Note: Transaction only has vector-based MultiGet API + std::vector keys(keys_arr.get(), keys_arr.get() + num_keys); std::vector values(num_keys); std::vector statuses = txn->rep->MultiGet(options->rep, cfs, keys, &values); @@ -6427,12 +7983,16 @@ void rocksdb_transaction_multi_get_for_update_cf( size_t num_keys, const char* const* keys_list, const size_t* keys_list_sizes, char** values_list, size_t* values_list_sizes, char** errs) { - std::vector keys(num_keys); + // Use unique_ptr for efficiency + std::unique_ptr keys_arr(new Slice[num_keys]); std::vector cfs(num_keys); for (size_t i = 0; i < num_keys; i++) { - keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + keys_arr[i] = Slice(keys_list[i], keys_list_sizes[i]); cfs[i] = column_families[i]->rep; } + // Note: GetForUpdate only has vector-based API, no array-based PinnableSlice + // variant + std::vector keys(keys_arr.get(), keys_arr.get() + num_keys); std::vector values(num_keys); std::vector statuses = txn->rep->MultiGetForUpdate(options->rep, cfs, keys, &values); @@ -6459,11 +8019,12 @@ char* rocksdb_transactiondb_get(rocksdb_transactiondb_t* txn_db, const char* key, size_t klen, size_t* vlen, char** errptr) { char* result = nullptr; - std::string tmp; - Status s = txn_db->rep->Get(options->rep, Slice(key, klen), &tmp); + PinnableSlice pinnable_val; + Status s = txn_db->rep->Get(options->rep, txn_db->rep->DefaultColumnFamily(), + Slice(key, klen), &pinnable_val); if (s.ok()) { - *vlen = tmp.size(); - result = CopyString(tmp); + *vlen = pinnable_val.size(); + result = CopyString(pinnable_val); } else { *vlen = 0; if (!s.IsNotFound()) { @@ -6494,12 +8055,12 @@ char* rocksdb_transactiondb_get_cf( rocksdb_column_family_handle_t* column_family, const char* key, size_t keylen, size_t* vallen, char** errptr) { char* result = nullptr; - std::string tmp; + PinnableSlice pinnable_val; Status s = txn_db->rep->Get(options->rep, column_family->rep, - Slice(key, keylen), &tmp); + Slice(key, keylen), &pinnable_val); if (s.ok()) { - *vallen = tmp.size(); - result = CopyString(tmp); + *vallen = pinnable_val.size(); + result = CopyString(pinnable_val); } else { *vallen = 0; if (!s.IsNotFound()) { @@ -6533,13 +8094,17 @@ void rocksdb_transactiondb_multi_get(rocksdb_transactiondb_t* txn_db, const size_t* keys_list_sizes, char** values_list, size_t* values_list_sizes, char** errs) { - std::vector keys(num_keys); + // Use unique_ptr for efficiency + std::unique_ptr keys(new Slice[num_keys]); for (size_t i = 0; i < num_keys; i++) { keys[i] = Slice(keys_list[i], keys_list_sizes[i]); } - std::vector values(num_keys); - std::vector statuses = - txn_db->rep->MultiGet(options->rep, keys, &values); + // Use PinnableSlice to avoid unnecessary allocations + auto cfh = txn_db->rep->DefaultColumnFamily(); + std::vector values(num_keys); + std::vector statuses(num_keys); + txn_db->rep->MultiGet(options->rep, cfh, num_keys, keys.get(), values.data(), + statuses.data()); for (size_t i = 0; i < num_keys; i++) { if (statuses[i].ok()) { values_list[i] = CopyString(values[i]); @@ -6563,15 +8128,18 @@ void rocksdb_transactiondb_multi_get_cf( size_t num_keys, const char* const* keys_list, const size_t* keys_list_sizes, char** values_list, size_t* values_list_sizes, char** errs) { - std::vector keys(num_keys); - std::vector cfs(num_keys); + // Use unique_ptr for efficiency + std::unique_ptr keys(new Slice[num_keys]); + std::unique_ptr cfs(new ColumnFamilyHandle*[num_keys]); for (size_t i = 0; i < num_keys; i++) { keys[i] = Slice(keys_list[i], keys_list_sizes[i]); cfs[i] = column_families[i]->rep; } - std::vector values(num_keys); - std::vector statuses = - txn_db->rep->MultiGet(options->rep, cfs, keys, &values); + // Use PinnableSlice to avoid unnecessary allocations + std::vector values(num_keys); + std::vector statuses(num_keys); + txn_db->rep->MultiGet(options->rep, num_keys, cfs.get(), keys.get(), + values.data(), statuses.data()); for (size_t i = 0; i < num_keys; i++) { if (statuses[i].ok()) { values_list[i] = CopyString(values[i]); @@ -6975,7 +8543,7 @@ rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create( dbs.push_back(db->rep); } - unordered_set cache_set; + std::unordered_set cache_set; for (auto cache : consumers->caches) { cache_set.insert(const_cast(cache->rep.get())); } @@ -7054,6 +8622,14 @@ void rocksdb_enable_manual_compaction(rocksdb_t* db) { db->rep->EnableManualCompaction(); } +void rocksdb_abort_all_compactions(rocksdb_t* db) { + db->rep->AbortAllCompactions(); +} + +void rocksdb_resume_all_compactions(rocksdb_t* db) { + db->rep->ResumeAllCompactions(); +} + rocksdb_statistics_histogram_data_t* rocksdb_statistics_histogram_data_create() { return new rocksdb_statistics_histogram_data_t{}; @@ -7164,4 +8740,110 @@ uint64_t rocksdb_wait_for_compact_options_get_timeout( return opt->rep.timeout.count(); } +/* High-performance zero-copy Get implementations */ + +struct rocksdb_pinnable_handle_t { + PinnableSlice rep; +}; + +rocksdb_pinnable_handle_t* rocksdb_get_pinned_v2( + rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, + size_t keylen, char** errptr) { + rocksdb_pinnable_handle_t* handle = new rocksdb_pinnable_handle_t; + Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(), + Slice(key, keylen), &handle->rep); + if (!s.ok()) { + delete handle; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return handle; +} + +rocksdb_pinnable_handle_t* rocksdb_get_pinned_cf_v2( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr) { + rocksdb_pinnable_handle_t* handle = new rocksdb_pinnable_handle_t; + Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen), + &handle->rep); + if (!s.ok()) { + delete handle; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return nullptr; + } + return handle; +} + +const char* rocksdb_pinnable_handle_get_value( + const rocksdb_pinnable_handle_t* handle, size_t* vallen) { + if (!handle) { + *vallen = 0; + return nullptr; + } + *vallen = handle->rep.size(); + return handle->rep.data(); +} + +void rocksdb_pinnable_handle_destroy(rocksdb_pinnable_handle_t* handle) { + delete handle; +} + +unsigned char rocksdb_get_into_buffer(rocksdb_t* db, + const rocksdb_readoptions_t* options, + const char* key, size_t keylen, + char* buffer, size_t buffer_size, + size_t* vallen, unsigned char* found, + char** errptr) { + PinnableSlice pinnable_val; + Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(), + Slice(key, keylen), &pinnable_val); + if (s.ok()) { + *found = 1; + *vallen = pinnable_val.size(); + if (buffer_size >= pinnable_val.size()) { + memcpy(buffer, pinnable_val.data(), pinnable_val.size()); + return 1; // Success - data copied + } + return 0; // Buffer too small + } else { + *found = 0; + *vallen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return 0; + } +} + +unsigned char rocksdb_get_into_buffer_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char* buffer, size_t buffer_size, size_t* vallen, + unsigned char* found, char** errptr) { + PinnableSlice pinnable_val; + Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen), + &pinnable_val); + if (s.ok()) { + *found = 1; + *vallen = pinnable_val.size(); + if (buffer_size >= pinnable_val.size()) { + memcpy(buffer, pinnable_val.data(), pinnable_val.size()); + return 1; // Success - data copied + } + return 0; // Buffer too small + } else { + *found = 0; + *vallen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + return 0; + } +} + } // end extern "C" diff --git a/db/c_test.c b/db/c_test.c index 18bf2961ded3..8c57d0fcf6ec 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -103,6 +103,12 @@ static void CheckValue(char* err, const char* expected, char** actual, Free(actual); } +static void CheckPinnedValue(char* err, const char* expected, + const char** actual, size_t actual_length) { + CheckNoError(err); + CheckEqual(expected, *actual, actual_length); +} + static void CheckGet(rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, const char* expected) { char* err = NULL; @@ -716,6 +722,88 @@ static void LoadAndCheckLatestOptions(const char* db_name, rocksdb_env_t* env, num_column_families); } +// Global state for tracking remote compaction calls +typedef struct { + int schedule_called; + int wait_called; + int cancel_called; + char last_scheduled_job_id[256]; + char last_db_name[256]; +} RemoteCompactionState; + +// Schedule callback - gets called when compaction is scheduled +static rocksdb_compactionservice_scheduleresponse_t* RemoteCompactionSchedule( + void* state, const rocksdb_compactionservice_jobinfo_t* info, + const char* input, size_t input_len) { + (void)input; + (void)input_len; + RemoteCompactionState* rcs = (RemoteCompactionState*)state; + rcs->schedule_called++; + + // Extract job info + size_t db_name_len; + const char* db_name = + rocksdb_compactionservice_jobinfo_t_get_db_name(info, &db_name_len); + memcpy(rcs->last_db_name, db_name, db_name_len); + rcs->last_db_name[db_name_len] = '\0'; + + // Generate a job ID + snprintf(rcs->last_scheduled_job_id, sizeof(rcs->last_scheduled_job_id), + "job-%d", rcs->schedule_called); + + // Create response with success status + char* err = NULL; + rocksdb_compactionservice_scheduleresponse_t* response = + rocksdb_compactionservice_scheduleresponse_create( + rcs->last_scheduled_job_id, + rocksdb_compactionservice_jobstatus_success, &err); + if (err) { + free(err); + } + return response; +} + +// Wait callback - simulates waiting for remote compaction to complete +static int RemoteCompactionWait(void* state, const char* scheduled_job_id, + char** result, size_t* result_len) { + RemoteCompactionState* rcs = (RemoteCompactionState*)state; + rcs->wait_called++; + + if (strcmp(scheduled_job_id, rcs->last_scheduled_job_id) != 0) { + return rocksdb_compactionservice_jobstatus_failure; + } + + // For testing purposes, return kUseLocal to cause RocksDB to fall back to + // local compaction. This tests the callback mechanism without needing a fully + // serialized result. In a real scenario, this would communicate with a remote + // worker that calls rocksdb_open_and_compact() and returns a properly + // serialized CompactionServiceResult + *result = NULL; + *result_len = 0; + + return rocksdb_compactionservice_jobstatus_use_local; +} + +// Cancel callback - cancels pending jobs +static void RemoteCompactionCancel(void* state) { + RemoteCompactionState* rcs = (RemoteCompactionState*)state; + rcs->cancel_called++; +} + +// Destructor callback +static void RemoteCompactionDestroy(void* state) { (void)state; } + +// NULL schedule callback for testing failure handling +static rocksdb_compactionservice_scheduleresponse_t* NullSchedule( + void* state, const rocksdb_compactionservice_jobinfo_t* info, + const char* input, size_t input_len) { + (void)state; + (void)info; + (void)input; + (void)input_len; + return NULL; // Return NULL to simulate failure +} + int main(int argc, char** argv) { (void)argc; (void)argv; @@ -1030,6 +1118,78 @@ int main(int argc, char** argv) { rocksdb_options_set_error_if_exists(options, 1); } + StartPhase("checkpoint_export_column_family"); + { + static char cf_export_path[200]; + static char db_import_path[200]; + snprintf(cf_export_path, sizeof(cf_export_path), + "%s/rocksdb_c_test-%d-cf_export", GetTempDir(), ((int)geteuid())); + snprintf(db_import_path, sizeof(db_import_path), + "%s/rocksdb_c_test-%d-db_import", GetTempDir(), ((int)geteuid())); + + rocksdb_options_t* db_options = rocksdb_options_create(); + rocksdb_column_family_handle_t* cf_export = + rocksdb_create_column_family(db, db_options, "cf_export", &err); + CheckNoError(err); + + rocksdb_put_cf(db, woptions, cf_export, "k1", 2, "v1", 2, &err); + CheckNoError(err); + rocksdb_put_cf(db, woptions, cf_export, "k2", 2, "v2", 2, &err); + CheckNoError(err); + + rocksdb_checkpoint_t* checkpoint = + rocksdb_checkpoint_object_create(db, &err); + CheckNoError(err); + + rocksdb_export_import_files_metadata_t* export_metadata = + rocksdb_checkpoint_export_column_family(checkpoint, cf_export, + cf_export_path, &err); + CheckNoError(err); + const char* comparator_name = + rocksdb_export_import_files_metadata_get_db_comparator_name( + export_metadata); + CheckEqual("leveldb.BytewiseComparator", comparator_name, 26); + rocksdb_free((void*)comparator_name); + rocksdb_checkpoint_object_destroy(checkpoint); + checkpoint = NULL; + rocksdb_drop_column_family(db, cf_export, &err); + CheckNoError(err); + rocksdb_column_family_handle_destroy(cf_export); + rocksdb_options_set_create_if_missing(db_options, 1); + rocksdb_options_set_error_if_exists(db_options, 1); + rocksdb_t* db_import = rocksdb_open(db_options, db_import_path, &err); + CheckNoError(err); + rocksdb_import_column_family_options_t* import_options = + rocksdb_import_column_family_options_create(); + rocksdb_column_family_handle_t* cf_import = + rocksdb_create_column_family_with_import(db_import, db_options, + "cf_import", import_options, + export_metadata, &err); + CheckNoError(err); + rocksdb_import_column_family_options_destroy(import_options); + rocksdb_export_import_files_metadata_destroy(export_metadata); + size_t val_len; + char* val = + rocksdb_get_cf(db_import, roptions, cf_import, "k1", 2, &val_len, &err); + CheckNoError(err); + CheckEqual("v1", val, val_len); + free(val); + + val = + rocksdb_get_cf(db_import, roptions, cf_import, "k2", 2, &val_len, &err); + CheckNoError(err); + CheckEqual("v2", val, val_len); + free(val); + + rocksdb_column_family_handle_destroy(cf_import); + cf_import = NULL; + rocksdb_close(db_import); + rocksdb_destroy_db(db_options, db_import_path, &err); + CheckNoError(err); + rocksdb_options_destroy(db_options); + db_options = NULL; + } + StartPhase("compactall"); rocksdb_compact_range(db, NULL, 0, NULL, 0); CheckGet(db, roptions, "foo", "hello"); @@ -1177,6 +1337,70 @@ int main(int argc, char** argv) { rocksdb_writebatch_destroy(wb); } + StartPhase("writebatch_vectors_cf"); + { + const char* cf_name = "wb_vectors_cf"; + rocksdb_column_family_handle_t* wb_cf = + rocksdb_create_column_family(db, options, cf_name, &err); + CheckNoError(err); + + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + + // Test putv_cf: concatenates multiple slices into a single key/value + const char* put_keys[2] = {"k", "ey"}; + const size_t put_key_sizes[2] = {1, 2}; + const char* put_vals[3] = {"v", "a", "l"}; + const size_t put_val_sizes[3] = {1, 1, 1}; + rocksdb_writebatch_putv_cf(wb, wb_cf, 2, put_keys, put_key_sizes, 3, + put_vals, put_val_sizes); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + // putv_cf concatenates: key="k"+"ey"="key", value="v"+"a"+"l"="val" + CheckGetCF(db, roptions, wb_cf, "key", "val"); + CheckGetCF(db, roptions, wb_cf, "k", NULL); + CheckGetCF(db, roptions, wb_cf, "ey", NULL); + + // Test deletev_cf: concatenates multiple slices for key + rocksdb_writebatch_clear(wb); + const char* del_keys[2] = {"k", "ey"}; + const size_t del_key_sizes[2] = {1, 2}; + rocksdb_writebatch_deletev_cf(wb, wb_cf, 2, del_keys, del_key_sizes); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGetCF(db, roptions, wb_cf, "key", NULL); + + // Test delete_rangev_cf: concatenates slices for range deletion + rocksdb_writebatch_clear(wb); + rocksdb_writebatch_put_cf(wb, wb_cf, "a", 1, "1", 1); + rocksdb_writebatch_put_cf(wb, wb_cf, "b", 1, "2", 1); + rocksdb_writebatch_put_cf(wb, wb_cf, "c", 1, "3", 1); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGetCF(db, roptions, wb_cf, "a", "1"); + CheckGetCF(db, roptions, wb_cf, "b", "2"); + CheckGetCF(db, roptions, wb_cf, "c", "3"); + + rocksdb_writebatch_clear(wb); + const char* range_start[2] = {"a", ""}; // "a" + "" = "a" + const size_t range_start_sizes[2] = {1, 0}; + const char* range_end[2] = {"c", ""}; + const size_t range_end_sizes[2] = {1, 0}; + rocksdb_writebatch_delete_rangev_cf(wb, wb_cf, 2, range_start, + range_start_sizes, range_end, + range_end_sizes); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + // Range [a, c) should delete "a" and "b", but not "c" + CheckGetCF(db, roptions, wb_cf, "a", NULL); + CheckGetCF(db, roptions, wb_cf, "b", NULL); + CheckGetCF(db, roptions, wb_cf, "c", "3"); + + rocksdb_writebatch_destroy(wb); + rocksdb_drop_column_family(db, wb_cf, &err); + CheckNoError(err); + rocksdb_column_family_handle_destroy(wb_cf); + } + StartPhase("writebatch_vectors"); { rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); @@ -1245,6 +1469,8 @@ int main(int argc, char** argv) { CheckCondition(count == 3); size_t size; char* value; + const char* pinned_value; + rocksdb_pinnableslice_t* p; value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "box", 3, &size, &err); CheckValue(err, "c", &value, size); @@ -1254,9 +1480,19 @@ int main(int argc, char** argv) { value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions, "foo", 3, &size, &err); CheckValue(err, "hello", &value, size); + p = rocksdb_writebatch_wi_get_pinned_from_batch_and_db(wbi, db, roptions, + "foo", 3, &err); + pinned_value = rocksdb_pinnableslice_value(p, &size); + CheckPinnedValue(err, "hello", &pinned_value, size); + rocksdb_pinnableslice_destroy(p); value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions, "box", 3, &size, &err); CheckValue(err, "c", &value, size); + p = rocksdb_writebatch_wi_get_pinned_from_batch_and_db(wbi, db, roptions, + "box", 3, &err); + pinned_value = rocksdb_pinnableslice_value(p, &size); + CheckPinnedValue(err, "c", &pinned_value, size); + rocksdb_pinnableslice_destroy(p); rocksdb_write_writebatch_wi(db, woptions, wbi, &err); CheckNoError(err); CheckGet(db, roptions, "foo", "hello"); @@ -1330,6 +1566,43 @@ int main(int argc, char** argv) { rocksdb_iter_destroy(iter); } + StartPhase("iter_slice"); + { + // Test the new slice-based iterator API for better performance + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + + // Test rocksdb_iter_key_slice + rocksdb_slice_t key_slice = rocksdb_iter_key_slice(iter); + CheckEqual("box", key_slice.data, key_slice.size); + + // Test rocksdb_iter_value_slice + rocksdb_slice_t value_slice = rocksdb_iter_value_slice(iter); + CheckEqual("c", value_slice.data, value_slice.size); + + // Move to next entry and test again + rocksdb_iter_next(iter); + CheckCondition(rocksdb_iter_valid(iter)); + key_slice = rocksdb_iter_key_slice(iter); + value_slice = rocksdb_iter_value_slice(iter); + CheckEqual("foo", key_slice.data, key_slice.size); + CheckEqual("hello", value_slice.data, value_slice.size); + + // Test seeking with slice API + rocksdb_iter_seek(iter, "b", 1); + CheckCondition(rocksdb_iter_valid(iter)); + key_slice = rocksdb_iter_key_slice(iter); + value_slice = rocksdb_iter_value_slice(iter); + CheckEqual("box", key_slice.data, key_slice.size); + CheckEqual("c", value_slice.data, value_slice.size); + + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + } + StartPhase("wbwi_iter"); { rocksdb_iterator_t* base_iter = rocksdb_create_iterator(db, roptions); @@ -1362,6 +1635,46 @@ int main(int argc, char** argv) { rocksdb_writebatch_wi_destroy(wbi); } + StartPhase("wbwi_iter_readoptions"); + { + rocksdb_readoptions_t* iter_roptions = rocksdb_readoptions_create(); + rocksdb_readoptions_set_iterate_lower_bound(iter_roptions, "boy", 3); + rocksdb_readoptions_set_iterate_upper_bound(iter_roptions, "fool", 4); + rocksdb_iterator_t* base_iter = rocksdb_create_iterator(db, iter_roptions); + rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1); + rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", + 1); // should get filtered out + rocksdb_writebatch_wi_put(wbi, "cat", 3, "miau", 4); + rocksdb_writebatch_wi_put(wbi, "gnu", 3, "muh", + 3); // should get filtered out + rocksdb_iterator_t* iter = + rocksdb_writebatch_wi_create_iterator_with_base_readopts(wbi, base_iter, + iter_roptions); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "cat", "miau"); + rocksdb_iter_next(iter); + CheckIter(iter, "foo", "hello"); + rocksdb_iter_prev(iter); + CheckIter(iter, "cat", "miau"); + rocksdb_iter_prev(iter); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_last(iter); + CheckIter(iter, "foo", "hello"); + rocksdb_iter_seek(iter, "b", 1); + CheckIter(iter, "cat", "miau"); + rocksdb_iter_seek_for_prev(iter, "d", 1); + CheckIter(iter, "cat", "miau"); + rocksdb_iter_seek_for_prev(iter, "fool", 3); + CheckIter(iter, "foo", "hello"); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + rocksdb_writebatch_wi_destroy(wbi); + rocksdb_readoptions_destroy(iter_roptions); + } + StartPhase("multiget"); { const char* keys[3] = {"box", "foo", "notfound"}; @@ -1375,6 +1688,53 @@ int main(int argc, char** argv) { CheckMultiGetValues(3, vals, vals_sizes, errs, expected); } + StartPhase("zero_copy_get_pinned_v2"); + { + // Test new zero-copy get functions + + // Test rocksdb_get_pinned_v2 + rocksdb_pinnable_handle_t* handle = + rocksdb_get_pinned_v2(db, roptions, "foo", 3, &err); + CheckNoError(err); + CheckCondition(handle != NULL); + size_t val_len; + const char* val = rocksdb_pinnable_handle_get_value(handle, &val_len); + CheckEqual("hello", val, val_len); + rocksdb_pinnable_handle_destroy(handle); + + // Test with non-existent key + handle = rocksdb_get_pinned_v2(db, roptions, "notfound", 8, &err); + CheckNoError(err); + CheckCondition(handle == NULL); + + // Test rocksdb_get_into_buffer + char buffer[100]; + unsigned char found; + unsigned char success = rocksdb_get_into_buffer( + db, roptions, "foo", 3, buffer, sizeof(buffer), &val_len, &found, &err); + CheckNoError(err); + CheckCondition(success == 1); + CheckCondition(found == 1); + CheckCondition(val_len == 5); + CheckCondition(memcmp(buffer, "hello", 5) == 0); + + // Test with buffer too small + success = rocksdb_get_into_buffer(db, roptions, "foo", 3, buffer, + 2, // Buffer too small + &val_len, &found, &err); + CheckNoError(err); + CheckCondition(success == 0); // Should fail due to small buffer + CheckCondition(found == 1); + CheckCondition(val_len == 5); // Should still report actual size + + // Test with non-existent key + success = rocksdb_get_into_buffer(db, roptions, "notfound", 8, buffer, + sizeof(buffer), &val_len, &found, &err); + CheckNoError(err); + CheckCondition(success == 0); + CheckCondition(found == 0); + } + StartPhase("pin_get"); { CheckPinGet(db, roptions, "box", "c"); @@ -1792,6 +2152,84 @@ int main(int argc, char** argv) { rocksdb_flush_wal(db, 1, &err); CheckNoError(err); + // Test column family handle get name + { + size_t name_len; + char* cf_name = + rocksdb_column_family_handle_get_name(handles[1], &name_len); + CheckCondition(name_len == 3); + CheckCondition(memcmp(cf_name, "cf1", 3) == 0); + rocksdb_free(cf_name); + } + + // Test zero-copy get with column families + { + rocksdb_pinnable_handle_t* handle = + rocksdb_get_pinned_cf_v2(db, roptions, handles[1], "box", 3, &err); + CheckNoError(err); + CheckCondition(handle != NULL); + size_t val_len; + const char* val = rocksdb_pinnable_handle_get_value(handle, &val_len); + CheckEqual("c", val, val_len); + rocksdb_pinnable_handle_destroy(handle); + + // Test with non-existent key + handle = rocksdb_get_pinned_cf_v2(db, roptions, handles[1], "notfound", 8, + &err); + CheckNoError(err); + CheckCondition(handle == NULL); + + // Test rocksdb_get_into_buffer_cf + char buffer[100]; + unsigned char found; + unsigned char success = rocksdb_get_into_buffer_cf( + db, roptions, handles[1], "buff", 4, buffer, sizeof(buffer), &val_len, + &found, &err); + CheckNoError(err); + CheckCondition(success == 1); + CheckCondition(found == 1); + CheckCondition(val_len == 7); + CheckCondition(memcmp(buffer, "rocksdb", 7) == 0); + + // Test with buffer too small + success = rocksdb_get_into_buffer_cf(db, roptions, handles[1], "buff", 4, + buffer, 3, // Buffer too small + &val_len, &found, &err); + CheckNoError(err); + CheckCondition(success == 0); // Should fail due to small buffer + CheckCondition(found == 1); + CheckCondition(val_len == 7); // Should still report actual size + } + + // Test WriteBatchWithIndex iteration with Column Family + rocksdb_writebatch_wi_t* wbwi = rocksdb_writebatch_wi_create(0, true); + rocksdb_writebatch_wi_put_cf(wbwi, handles[1], "boat", 4, "row", + 3); // should be filtered out + rocksdb_writebatch_wi_put_cf(wbwi, handles[1], "buffy", 5, "charmed", 7); + rocksdb_writebatch_wi_put_cf(wbwi, handles[1], "bus", 3, "yellow", + 6); // should be filtered out + rocksdb_readoptions_t* iter_roptions = rocksdb_readoptions_create(); + rocksdb_readoptions_set_iterate_lower_bound(iter_roptions, "bu", 2); + rocksdb_readoptions_set_iterate_upper_bound(iter_roptions, "buffz", 5); + rocksdb_iterator_t* base_iter = + rocksdb_create_iterator_cf(db, iter_roptions, handles[1]); + rocksdb_iterator_t* wbwi_iter = + rocksdb_writebatch_wi_create_iterator_with_base_cf_readopts( + wbwi, base_iter, handles[1], iter_roptions); + + CheckCondition(!rocksdb_iter_valid(wbwi_iter)); + rocksdb_iter_seek_to_first(wbwi_iter); + CheckCondition(rocksdb_iter_valid(wbwi_iter)); + CheckIter(wbwi_iter, "buff", "rocksdb"); + rocksdb_iter_next(wbwi_iter); + CheckIter(wbwi_iter, "buffy", "charmed"); + rocksdb_iter_next(wbwi_iter); + CheckCondition(!rocksdb_iter_valid(wbwi_iter)); + + rocksdb_iter_destroy(wbwi_iter); + rocksdb_writebatch_wi_destroy(wbwi); + rocksdb_readoptions_destroy(iter_roptions); + const char* keys[3] = {"box", "box", "barfooxx"}; const rocksdb_column_family_handle_t* get_handles[3] = { handles[0], handles[1], handles[1]}; @@ -1839,6 +2277,74 @@ int main(int argc, char** argv) { } } + { + // Test rocksdb_batched_multi_get_cf_slice for better performance + // Build rocksdb_slice_t array directly to avoid conversion overhead + rocksdb_slice_t batched_key_slices[4]; + batched_key_slices[0].data = "box"; + batched_key_slices[0].size = 3; + batched_key_slices[1].data = "buff"; + batched_key_slices[1].size = 4; + batched_key_slices[2].data = "barfooxx"; + batched_key_slices[2].size = 8; + batched_key_slices[3].data = "box"; + batched_key_slices[3].size = 3; + + const char* expected_value[4] = {"c", "rocksdb", NULL, "c"}; + char* batched_errs[4]; + rocksdb_pinnableslice_t* pvals[4]; + + rocksdb_batched_multi_get_cf_slice(db, roptions, handles[1], 4, + batched_key_slices, pvals, + batched_errs, false); + + const char* val; + size_t val_len; + for (i = 0; i < 4; ++i) { + CheckNoError(batched_errs[i]); + if (pvals[i] != NULL) { + val = rocksdb_pinnableslice_value(pvals[i], &val_len); + CheckEqual(expected_value[i], val, val_len); + rocksdb_pinnableslice_destroy(pvals[i]); + } else { + CheckEqual(expected_value[i], NULL, 0); + } + } + } + + { + // Test rocksdb_batched_multi_get_cf_slice with sorted_input=true + // Keys must be in sorted order for this optimization + rocksdb_slice_t sorted_key_slices[3]; + sorted_key_slices[0].data = "box"; + sorted_key_slices[0].size = 3; + sorted_key_slices[1].data = "buff"; + sorted_key_slices[1].size = 4; + sorted_key_slices[2].data = "notfound"; + sorted_key_slices[2].size = 8; + + const char* expected_value[3] = {"c", "rocksdb", NULL}; + char* batched_errs[3]; + rocksdb_pinnableslice_t* pvals[3]; + + rocksdb_batched_multi_get_cf_slice(db, roptions, handles[1], 3, + sorted_key_slices, pvals, batched_errs, + true); + + const char* val; + size_t val_len; + for (i = 0; i < 3; ++i) { + CheckNoError(batched_errs[i]); + if (pvals[i] != NULL) { + val = rocksdb_pinnableslice_value(pvals[i], &val_len); + CheckEqual(expected_value[i], val, val_len); + rocksdb_pinnableslice_destroy(pvals[i]); + } else { + CheckEqual(expected_value[i], NULL, 0); + } + } + } + { unsigned char value_found = 0; @@ -2129,16 +2635,20 @@ int main(int argc, char** argv) { CheckCondition(100000 == rocksdb_options_get_periodic_compaction_seconds(o)); + rocksdb_options_set_memtable_op_scan_flush_trigger(o, 100); + CheckCondition(100 == + rocksdb_options_get_memtable_op_scan_flush_trigger(o)); + + rocksdb_options_set_memtable_avg_op_scan_flush_trigger(o, 150); + CheckCondition(150 == + rocksdb_options_get_memtable_avg_op_scan_flush_trigger(o)); + rocksdb_options_set_ttl(o, 5000); CheckCondition(5000 == rocksdb_options_get_ttl(o)); rocksdb_options_set_skip_stats_update_on_db_open(o, 1); CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o)); - rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(o, 1); - CheckCondition( - 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o)); - rocksdb_options_set_max_write_buffer_number(o, 97); CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o)); @@ -2146,10 +2656,6 @@ int main(int argc, char** argv) { CheckCondition(23 == rocksdb_options_get_min_write_buffer_number_to_merge(o)); - rocksdb_options_set_max_write_buffer_number_to_maintain(o, 64); - CheckCondition(64 == - rocksdb_options_get_max_write_buffer_number_to_maintain(o)); - rocksdb_options_set_max_write_buffer_size_to_maintain(o, 50000); CheckCondition(50000 == rocksdb_options_get_max_write_buffer_size_to_maintain(o)); @@ -2402,13 +2908,9 @@ int main(int argc, char** argv) { CheckCondition(2.0 == rocksdb_options_get_max_bytes_for_level_multiplier(copy)); CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(copy)); - CheckCondition( - 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy)); CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(copy)); CheckCondition(23 == rocksdb_options_get_min_write_buffer_number_to_merge(copy)); - CheckCondition( - 64 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy)); CheckCondition(50000 == rocksdb_options_get_max_write_buffer_size_to_maintain(copy)); CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(copy)); @@ -2572,6 +3074,18 @@ int main(int argc, char** argv) { CheckCondition(100000 == rocksdb_options_get_periodic_compaction_seconds(o)); + rocksdb_options_set_memtable_op_scan_flush_trigger(copy, 800); + CheckCondition(800 == + rocksdb_options_get_memtable_op_scan_flush_trigger(copy)); + CheckCondition(100 == + rocksdb_options_get_memtable_op_scan_flush_trigger(o)); + + rocksdb_options_set_memtable_avg_op_scan_flush_trigger(copy, 900); + CheckCondition( + 900 == rocksdb_options_get_memtable_avg_op_scan_flush_trigger(copy)); + CheckCondition(150 == + rocksdb_options_get_memtable_avg_op_scan_flush_trigger(o)); + rocksdb_options_set_ttl(copy, 8000); CheckCondition(8000 == rocksdb_options_get_ttl(copy)); CheckCondition(5000 == rocksdb_options_get_ttl(o)); @@ -2580,12 +3094,6 @@ int main(int argc, char** argv) { CheckCondition(0 == rocksdb_options_get_skip_stats_update_on_db_open(copy)); CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o)); - rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(copy, 0); - CheckCondition( - 0 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy)); - CheckCondition( - 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o)); - rocksdb_options_set_max_write_buffer_number(copy, 2000); CheckCondition(2000 == rocksdb_options_get_max_write_buffer_number(copy)); CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o)); @@ -2596,12 +3104,6 @@ int main(int argc, char** argv) { CheckCondition(23 == rocksdb_options_get_min_write_buffer_number_to_merge(o)); - rocksdb_options_set_max_write_buffer_number_to_maintain(copy, 128); - CheckCondition( - 128 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy)); - CheckCondition(64 == - rocksdb_options_get_max_write_buffer_number_to_maintain(o)); - rocksdb_options_set_max_write_buffer_size_to_maintain(copy, 9000); CheckCondition(9000 == rocksdb_options_get_max_write_buffer_size_to_maintain(copy)); @@ -3094,6 +3596,14 @@ int main(int argc, char** argv) { 100000 == rocksdb_fifo_compaction_options_get_max_table_files_size(fco)); + rocksdb_fifo_compaction_options_set_max_data_files_size(fco, 200000); + CheckCondition( + 200000 == rocksdb_fifo_compaction_options_get_max_data_files_size(fco)); + + rocksdb_fifo_compaction_options_set_use_kv_ratio_compaction(fco, 1); + CheckCondition( + 1 == rocksdb_fifo_compaction_options_get_use_kv_ratio_compaction(fco)); + rocksdb_fifo_compaction_options_destroy(fco); } @@ -3314,6 +3824,17 @@ int main(int argc, char** argv) { rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err); CheckNoError(err); + // test transaction get/set name (before commit) + { + rocksdb_transaction_set_name(txn, "test_txn", 8, &err); + CheckNoError(err); + size_t name_len; + char* txn_name = rocksdb_transaction_get_name(txn, &name_len); + CheckCondition(name_len == 8); + CheckCondition(memcmp(txn_name, "test_txn", 8) == 0); + rocksdb_free(txn_name); + } + // read from outside transaction, before commit CheckTxnDBGet(txn_db, roptions, "foo", NULL); CheckTxnDBPinGet(txn_db, roptions, "foo", NULL); @@ -3934,7 +4455,7 @@ int main(int argc, char** argv) { StartPhase("statistics"); { - const uint32_t BYTES_WRITTEN_TICKER = 60; + const uint32_t BYTES_WRITTEN_TICKER = 61; const uint32_t DB_WRITE_HIST = 1; rocksdb_statistics_histogram_data_t* hist = @@ -4052,6 +4573,313 @@ int main(int argc, char** argv) { rocksdb_cache_destroy(lru); } + StartPhase("remote_compaction_service"); + { + RemoteCompactionState remote_state = {0, 0, 0, "", ""}; + + // Create compaction service + rocksdb_compactionservice_t* service = rocksdb_compactionservice_create( + &remote_state, // state + RemoteCompactionDestroy, // destructor + RemoteCompactionSchedule, // schedule callback + "TestRemoteCompaction", // name + RemoteCompactionWait, // wait callback + RemoteCompactionCancel, // cancel_awaiting_jobs + NULL); // on_installation + + // Create options with remote compaction + rocksdb_options_t* remote_options = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(remote_options, 1); + rocksdb_options_set_level0_file_num_compaction_trigger(remote_options, 2); + rocksdb_options_set_write_buffer_size(remote_options, + 64 * 1024); // 64KB buffer + rocksdb_options_set_max_bytes_for_level_base(remote_options, + 256 * 1024); // 256KB + rocksdb_options_set_target_file_size_base( + remote_options, 64 * 1024); // 64KB target file size + // Disable automatic compactions to test manual compaction only + rocksdb_options_set_disable_auto_compactions(remote_options, 1); + rocksdb_options_set_compaction_service(remote_options, service); + + // Destroy old DB and create new one + rocksdb_close(db); + rocksdb_destroy_db(remote_options, dbname, &err); + CheckNoError(err); + + db = rocksdb_open(remote_options, dbname, &err); + CheckNoError(err); + + // Create multiple SST files to trigger compaction + rocksdb_flushoptions_t* flush_opts = rocksdb_flushoptions_create(); + rocksdb_flushoptions_set_wait(flush_opts, 1); + + // Write and flush multiple times to create multiple L0 files + // Write more data with larger values to ensure files are substantial + for (int batch = 0; batch < 5; batch++) { + for (int i = 0; i < 200; i++) { + char key[20], val[1000]; + snprintf(key, sizeof(key), "key%d_%d", batch, i); + // Fill value with repeated data to make it larger + memset(val, 'a' + (batch % 26), sizeof(val) - 1); + val[sizeof(val) - 1] = '\0'; + rocksdb_put(db, woptions, key, strlen(key), val, strlen(val), &err); + CheckNoError(err); + } + rocksdb_flush(db, flush_opts, &err); + CheckNoError(err); + } + rocksdb_flushoptions_destroy(flush_opts); + + // Trigger manual compaction to invoke remote compaction service + rocksdb_compact_range(db, NULL, 0, NULL, 0); + + rocksdb_wait_for_compact_options_t* wco = + rocksdb_wait_for_compact_options_create(); + rocksdb_wait_for_compact(db, wco, &err); + CheckNoError(err); + rocksdb_wait_for_compact_options_destroy(wco); + + // Verify that callbacks were actually called + CheckCondition(remote_state.schedule_called > 0); + CheckCondition(remote_state.wait_called > 0); + CheckCondition(strlen(remote_state.last_db_name) > 0); + CheckCondition(strstr(remote_state.last_db_name, "rocksdb_c_test") != NULL); + + // Verify data is still accessible after remote compaction + // Just check a few keys to verify data integrity + for (int batch = 0; batch < 5; batch++) { + char key[20]; + snprintf(key, sizeof(key), "key%d_0", batch); + size_t vallen; + char* val = rocksdb_get(db, roptions, key, strlen(key), &vallen, &err); + CheckNoError(err); + CheckCondition(val != NULL); + CheckCondition(vallen == 999); // strlen of 1000-byte string + free(val); + } + + // Test cancellation API directly + RemoteCompactionCancel(&remote_state); + CheckCondition(remote_state.cancel_called > 0); + + // Cleanup + rocksdb_close(db); + rocksdb_destroy_db(remote_options, dbname, &err); + CheckNoError(err); + rocksdb_options_destroy(remote_options); + + // Reopen DB with original options for subsequent tests + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + } + + StartPhase("remote_compaction_scheduleresponse"); + { + // Test scheduleresponse creation and getters + rocksdb_compactionservice_scheduleresponse_t* response; + + // Test success response + err = NULL; + response = rocksdb_compactionservice_scheduleresponse_create( + "test-job-123", rocksdb_compactionservice_jobstatus_success, &err); + CheckNoError(err); + CheckCondition(response != NULL); + CheckCondition( + rocksdb_compactionservice_scheduleresponse_getstatus(response) == + rocksdb_compactionservice_jobstatus_success); + + size_t job_id_len; + const char* job_id = + rocksdb_compactionservice_scheduleresponse_get_scheduled_job_id( + response, &job_id_len); + CheckCondition(job_id_len == strlen("test-job-123")); + CheckCondition(memcmp(job_id, "test-job-123", job_id_len) == 0); + rocksdb_compactionservice_scheduleresponse_t_destroy(response); + + // Test failure response + response = rocksdb_compactionservice_scheduleresponse_create_with_status( + rocksdb_compactionservice_jobstatus_failure, &err); + CheckCondition(response != NULL); + CheckCondition( + rocksdb_compactionservice_scheduleresponse_getstatus(response) == + rocksdb_compactionservice_jobstatus_failure); + rocksdb_compactionservice_scheduleresponse_t_destroy(response); + + response = rocksdb_compactionservice_scheduleresponse_create_with_status( + 999, &err); + CheckCondition(response == NULL); // Invalid status + if (err) { + Free(&err); + } + } + + StartPhase("remote_compaction_options_override"); + { + // Test CompactionServiceOptionsOverride API + rocksdb_compaction_service_options_override_t* override_opts = + rocksdb_compaction_service_options_override_create(); + CheckCondition(override_opts != NULL); + + // Set up override options + rocksdb_compaction_service_options_override_set_env(override_opts, env); + rocksdb_compaction_service_options_override_set_comparator(override_opts, + cmp); + + // Test file checksum gen factory + rocksdb_file_checksum_gen_factory_t* checksum_factory = + rocksdb_file_checksum_gen_crc32c_factory_create(); + CheckCondition(checksum_factory != NULL); + rocksdb_compaction_service_options_override_set_file_checksum_gen_factory( + override_opts, checksum_factory); + + // Test SST partitioner factory + rocksdb_sst_partitioner_factory_t* partitioner_factory = + rocksdb_sst_partitioner_fixed_prefix_factory_create(4); + CheckCondition(partitioner_factory != NULL); + rocksdb_compaction_service_options_override_set_sst_partitioner_factory( + override_opts, partitioner_factory); + + // Test merge operator + rocksdb_compaction_service_options_override_set_merge_operator( + override_opts, NULL); + + // Test compaction filter + rocksdb_compaction_service_options_override_set_compaction_filter( + override_opts, NULL); + + // Test prefix extractor + rocksdb_compaction_service_options_override_set_prefix_extractor( + override_opts, NULL); + + // Test table factory - block based + rocksdb_block_based_table_options_t* table_opts = + rocksdb_block_based_options_create(); + rocksdb_compaction_service_options_override_set_block_based_table_factory( + override_opts, table_opts); + rocksdb_block_based_options_destroy(table_opts); + + // Test statistics via options + rocksdb_options_t* stats_opts = rocksdb_options_create(); + rocksdb_options_enable_statistics(stats_opts); + rocksdb_compaction_service_options_override_set_statistics(override_opts, + stats_opts); + rocksdb_options_destroy(stats_opts); + + // Test info log + rocksdb_logger_t* logger = + rocksdb_logger_create_stderr_logger(1, "test_prefix"); + rocksdb_compaction_service_options_override_set_info_log(override_opts, + logger); + rocksdb_logger_destroy(logger); + + // Test options map + rocksdb_compaction_service_options_override_set_option( + override_opts, "max_bytes_for_level_base", "67108864"); + + // Cleanup + rocksdb_file_checksum_gen_factory_destroy(checksum_factory); + rocksdb_sst_partitioner_factory_destroy(partitioner_factory); + rocksdb_compaction_service_options_override_destroy(override_opts); + } + + StartPhase("factory_options_on_regular_options"); + { + // Test that the new factory types work with regular rocksdb_options_t + rocksdb_options_t* test_opts = rocksdb_options_create(); + + // Test file checksum gen factory on regular options + rocksdb_file_checksum_gen_factory_t* checksum_factory = + rocksdb_file_checksum_gen_crc32c_factory_create(); + CheckCondition(checksum_factory != NULL); + rocksdb_options_set_file_checksum_gen_factory(test_opts, checksum_factory); + + // Test SST partitioner factory on regular options + rocksdb_sst_partitioner_factory_t* partitioner_factory = + rocksdb_sst_partitioner_fixed_prefix_factory_create(8); + CheckCondition(partitioner_factory != NULL); + rocksdb_options_set_sst_partitioner_factory(test_opts, partitioner_factory); + + // Cleanup + rocksdb_file_checksum_gen_factory_destroy(checksum_factory); + rocksdb_sst_partitioner_factory_destroy(partitioner_factory); + rocksdb_options_destroy(test_opts); + } + + StartPhase("remote_compaction_null_callback_handling"); + { + // Test that NULL callback returns are handled gracefully + // This simulates a failure in the remote compaction service + rocksdb_compactionservice_t* null_service = + rocksdb_compactionservice_create(NULL, NULL, NullSchedule, + "NullTestService", NULL, NULL, NULL); + + rocksdb_options_t* null_opts = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(null_opts, 1); + rocksdb_options_set_compaction_service(null_opts, null_service); + + const char* null_db = "rocksdb_c_test_null_service"; + + rocksdb_t* null_db_handle = rocksdb_open(null_opts, null_db, &err); + CheckNoError(err); + + // Write data and trigger compaction + for (int i = 0; i < 100; i++) { + char key[20], val[50]; + snprintf(key, sizeof(key), "key%d", i); + snprintf(val, sizeof(val), "val%d", i); + rocksdb_put(null_db_handle, woptions, key, strlen(key), val, strlen(val), + &err); + CheckNoError(err); + } + + // This should fall back to local compaction (not crash) + rocksdb_compact_range(null_db_handle, NULL, 0, NULL, 0); + + // Data should still be readable + CheckGet(null_db_handle, roptions, "key50", "val50"); + + rocksdb_close(null_db_handle); + rocksdb_destroy_db(null_opts, null_db, &err); + rocksdb_options_destroy(null_opts); + } + + StartPhase("remote_compaction_canceled_flag"); + { + // Test atomic cancellation flag API + unsigned char* canceled = rocksdb_open_and_compact_canceled_create(); + CheckCondition(canceled != NULL); + + // Set cancellation + rocksdb_open_and_compact_canceled_set(canceled, 1); + + // Use with OpenAndCompactOptions + rocksdb_open_and_compact_options_t* oac_opts = + rocksdb_open_and_compact_options_create(); + rocksdb_open_and_compact_options_set_canceled(oac_opts, canceled); + rocksdb_open_and_compact_options_set_allow_resumption(oac_opts, 1); + + // Cleanup + rocksdb_open_and_compact_options_destroy(oac_opts); + rocksdb_open_and_compact_canceled_destroy(canceled); + } + + StartPhase("sst_file_manager"); + { + rocksdb_sst_file_manager_t* sst_file_manager; + sst_file_manager = rocksdb_sst_file_manager_create(env); + rocksdb_sst_file_manager_set_delete_rate_bytes_per_second(sst_file_manager, + 1); + rocksdb_sst_file_manager_set_max_trash_db_ratio(sst_file_manager, 0.75); + + CheckCondition(1 == + rocksdb_sst_file_manager_get_delete_rate_bytes_per_second( + sst_file_manager)); + CheckCondition(0.75 == rocksdb_sst_file_manager_get_max_trash_db_ratio( + sst_file_manager)); + + rocksdb_sst_file_manager_destroy(sst_file_manager); + } + StartPhase("cancel_all_background_work"); rocksdb_cancel_all_background_work(db, 1); diff --git a/db/column_family.cc b/db/column_family.cc index 2c1ad930ab01..bbf9f8210b31 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -110,23 +110,48 @@ void GetInternalTblPropCollFactory( } } +Status CheckCompressionSupportedWithManager( + CompressionType type, UnownedPtr mgr) { + if (mgr) { + if (!mgr->SupportsCompressionType(type)) { + return Status::NotSupported("Compression type " + + CompressionTypeToString(type) + + " is not recognized/supported by this " + "version of CompressionManager " + + mgr->GetId()); + } + } else { + if (!CompressionTypeSupported(type)) { + if (type <= kLastBuiltinCompression) { + return Status::InvalidArgument("Compression type " + + CompressionTypeToString(type) + + " is not linked with the binary."); + } else { + return Status::NotSupported( + "Compression type " + CompressionTypeToString(type) + + " is not recognized/supported by built-in CompressionManager."); + } + } + } + return Status::OK(); +} + Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) { if (!cf_options.compression_per_level.empty()) { for (size_t level = 0; level < cf_options.compression_per_level.size(); ++level) { - if (!CompressionTypeSupported(cf_options.compression_per_level[level])) { - return Status::InvalidArgument( - "Compression type " + - CompressionTypeToString(cf_options.compression_per_level[level]) + - " is not linked with the binary."); + Status s = CheckCompressionSupportedWithManager( + cf_options.compression_per_level[level], + cf_options.compression_manager.get()); + if (!s.ok()) { + return s; } } } else { - if (!CompressionTypeSupported(cf_options.compression)) { - return Status::InvalidArgument( - "Compression type " + - CompressionTypeToString(cf_options.compression) + - " is not linked with the binary."); + Status s = CheckCompressionSupportedWithManager( + cf_options.compression, cf_options.compression_manager.get()); + if (!s.ok()) { + return s; } } if (cf_options.compression_opts.zstd_max_train_bytes > 0) { @@ -168,7 +193,8 @@ Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) { } if (!cf_options.memtable_factory->IsInsertConcurrentlySupported()) { return Status::InvalidArgument( - "Memtable doesn't allow concurrent writes (allow_concurrent_memtable_write)"); + "Memtable doesn't allow concurrent writes " + "(allow_concurrent_memtable_write)"); } return Status::OK(); } @@ -199,8 +225,9 @@ const uint64_t kDefaultTtl = 0xfffffffffffffffe; const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe; } // anonymous namespace -ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, - const ColumnFamilyOptions& src) { +ColumnFamilyOptions SanitizeCfOptions(const ImmutableDBOptions& db_options, + bool read_only, + const ColumnFamilyOptions& src) { ColumnFamilyOptions result = src; size_t clamp_max = std::conditional< sizeof(size_t) == 4, std::integral_constant, @@ -239,6 +266,10 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, result.min_write_buffer_number_to_merge = 1; } + if (result.disallow_memtable_writes) { + // A simple memtable that enforces MarkReadOnly (unlike skip list) + result.memtable_factory = std::make_shared(); + } if (result.num_levels < 1) { result.num_levels = 1; @@ -249,22 +280,18 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, } if (result.compaction_style == kCompactionStyleUniversal && - db_options.allow_ingest_behind && result.num_levels < 3) { + (db_options.allow_ingest_behind || result.cf_allow_ingest_behind) && + result.num_levels < 3) { result.num_levels = 3; } if (result.max_write_buffer_number < 2) { result.max_write_buffer_number = 2; } - // fall back max_write_buffer_number_to_maintain if - // max_write_buffer_size_to_maintain is not set if (result.max_write_buffer_size_to_maintain < 0) { result.max_write_buffer_size_to_maintain = result.max_write_buffer_number * static_cast(result.write_buffer_size); - } else if (result.max_write_buffer_size_to_maintain == 0 && - result.max_write_buffer_number_to_maintain < 0) { - result.max_write_buffer_number_to_maintain = result.max_write_buffer_number; } // bloom filter size shouldn't exceed 1/4 of memtable size. if (result.memtable_prefix_bloom_size_ratio > 0.25) { @@ -374,7 +401,13 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, } if (result.max_compaction_bytes == 0) { - result.max_compaction_bytes = result.target_file_size_base * 25; + // For FIFO with use_kv_ratio_compaction, leave max_compaction_bytes as 0 + // to signal "auto-calculate target from capacity and SST/blob ratio." + // When explicitly set by the user, it overrides the auto-calculated target. + if (result.compaction_style != kCompactionStyleFIFO || + !result.compaction_options_fifo.use_kv_ratio_compaction) { + result.max_compaction_bytes = result.target_file_size_base * 25; + } } bool is_block_based_table = (result.table_factory->IsInstanceOf( @@ -435,6 +468,33 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, result.periodic_compaction_seconds = 0; } + if (read_only && (result.preserve_internal_time_seconds > 0 || + result.preclude_last_level_data_seconds > 0)) { + // With no writes coming in, we don't need periodic SeqnoToTime entries. + // Existing SST files may or may not have that info associated with them. + ROCKS_LOG_WARN( + db_options.info_log.get(), + "preserve_internal_time_seconds and preclude_last_level_data_seconds " + "are ignored in read-only DB"); + result.preserve_internal_time_seconds = 0; + result.preclude_last_level_data_seconds = 0; + } + + if (read_only) { + if (result.memtable_op_scan_flush_trigger) { + ROCKS_LOG_WARN(db_options.info_log.get(), + "option memtable_op_scan_flush_trigger is sanitized to " + "0(disabled) for read only DB."); + result.memtable_op_scan_flush_trigger = 0; + } + if (result.memtable_avg_op_scan_flush_trigger) { + ROCKS_LOG_WARN( + db_options.info_log.get(), + "option memtable_avg_op_scan_flush_trigger is sanitized to " + "0(disabled) for read only DB."); + result.memtable_avg_op_scan_flush_trigger = 0; + } + } return result; } @@ -492,6 +552,17 @@ void SuperVersion::Init( imm->Ref(); current->Ref(); refs.store(1, std::memory_order_relaxed); + + // There should be at least one mapping entry iff time tracking is enabled. +#ifndef NDEBUG + MinAndMaxPreserveSeconds preserve_info{mutable_cf_options}; + if (preserve_info.IsEnabled()) { + assert(seqno_to_time_mapping); + assert(!seqno_to_time_mapping->Empty()); + } else { + assert(seqno_to_time_mapping == nullptr); + } +#endif // NDEBUG } namespace { @@ -530,7 +601,7 @@ ColumnFamilyData::ColumnFamilyData( const FileOptions* file_options, ColumnFamilySet* column_family_set, BlockCacheTracer* const block_cache_tracer, const std::shared_ptr& io_tracer, const std::string& db_id, - const std::string& db_session_id) + const std::string& db_session_id, bool read_only) : id_(id), name_(name), dummy_versions_(_dummy_versions), @@ -540,7 +611,7 @@ ColumnFamilyData::ColumnFamilyData( dropped_(false), flush_skip_reschedule_(false), internal_comparator_(cf_options.comparator), - initial_cf_options_(SanitizeOptions(db_options, cf_options)), + initial_cf_options_(SanitizeCfOptions(db_options, read_only, cf_options)), ioptions_(db_options, initial_cf_options_), mutable_cf_options_(initial_cf_options_), is_delete_range_supported_( @@ -548,7 +619,6 @@ ColumnFamilyData::ColumnFamilyData( write_buffer_manager_(write_buffer_manager), mem_(nullptr), imm_(ioptions_.min_write_buffer_number_to_merge, - ioptions_.max_write_buffer_number_to_maintain, ioptions_.max_write_buffer_size_to_maintain), super_version_(nullptr), super_version_number_(0), @@ -1179,10 +1249,12 @@ Compaction* ColumnFamilyData::PickCompaction( const MutableCFOptions& mutable_options, const MutableDBOptions& mutable_db_options, const std::vector& existing_snapshots, - const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer) { + const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer, + bool require_max_output_level) { auto* result = compaction_picker_->PickCompaction( GetName(), mutable_options, mutable_db_options, existing_snapshots, - snapshot_checker, current_->storage_info(), log_buffer); + snapshot_checker, current_->storage_info(), log_buffer, + GetFullHistoryTsLow(), require_max_output_level); if (result != nullptr) { result->FinalizeInputInfo(current_); } @@ -1266,11 +1338,11 @@ Compaction* ColumnFamilyData::CompactRange( const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end, bool* conflict, uint64_t max_file_num_to_ignore, const std::string& trim_ts) { - auto* result = compaction_picker_->CompactRange( + auto* result = compaction_picker_->PickCompactionForCompactRange( GetName(), mutable_cf_options, mutable_db_options, current_->storage_info(), input_level, output_level, compact_range_options, begin, end, compaction_end, conflict, - max_file_num_to_ignore, trim_ts); + max_file_num_to_ignore, trim_ts, GetFullHistoryTsLow()); if (result != nullptr) { result->FinalizeInputInfo(current_); } @@ -1339,20 +1411,17 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) { return false; } -void ColumnFamilyData::InstallSuperVersion(SuperVersionContext* sv_context, - InstrumentedMutex* db_mutex) { +void ColumnFamilyData::InstallSuperVersion( + SuperVersionContext* sv_context, InstrumentedMutex* db_mutex, + std::optional> + new_seqno_to_time_mapping) { db_mutex->AssertHeld(); - return InstallSuperVersion(sv_context, mutable_cf_options_); -} -void ColumnFamilyData::InstallSuperVersion( - SuperVersionContext* sv_context, - const MutableCFOptions& mutable_cf_options) { SuperVersion* new_superversion = sv_context->new_superversion.release(); - new_superversion->mutable_cf_options = mutable_cf_options; + new_superversion->mutable_cf_options = GetLatestMutableCFOptions(); new_superversion->Init(this, mem_, imm_.current(), current_, - sv_context->new_seqno_to_time_mapping - ? std::move(sv_context->new_seqno_to_time_mapping) + new_seqno_to_time_mapping.has_value() + ? std::move(new_seqno_to_time_mapping.value()) : super_version_ ? super_version_->ShareSeqnoToTimeMapping() : nullptr); @@ -1365,7 +1434,7 @@ void ColumnFamilyData::InstallSuperVersion( // currently RecalculateWriteStallConditions() treats it as further slowing // down is needed. super_version_->write_stall_condition = - RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(new_superversion->mutable_cf_options); } else { super_version_->write_stall_condition = old_superversion->write_stall_condition; @@ -1378,8 +1447,9 @@ void ColumnFamilyData::InstallSuperVersion( ResetThreadLocalSuperVersions(); if (old_superversion->mutable_cf_options.write_buffer_size != - mutable_cf_options.write_buffer_size) { - mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size); + new_superversion->mutable_cf_options.write_buffer_size) { + mem_->UpdateWriteBufferSize( + new_superversion->mutable_cf_options.write_buffer_size); } if (old_superversion->write_stall_condition != new_superversion->write_stall_condition) { @@ -1499,6 +1569,34 @@ Status ColumnFamilyData::ValidateOptions( "FIFO compaction only supported with max_open_files = -1."); } + if (cf_options.compaction_options_fifo.use_kv_ratio_compaction) { + if (cf_options.compaction_style != kCompactionStyleFIFO) { + return Status::InvalidArgument( + "use_kv_ratio_compaction is only supported with FIFO compaction " + "style."); + } + if (!cf_options.compaction_options_fifo.allow_compaction) { + return Status::InvalidArgument( + "use_kv_ratio_compaction requires allow_compaction = true. " + "allow_compaction enables intra-L0 compaction, and " + "use_kv_ratio_compaction selects the picking strategy."); + } + if (cf_options.compaction_options_fifo.max_data_files_size == 0) { + return Status::InvalidArgument( + "use_kv_ratio_compaction requires max_data_files_size > 0 to " + "compute the target compacted file size from data capacity."); + } + } + + if (cf_options.compaction_options_fifo.max_data_files_size > 0 && + cf_options.compaction_options_fifo.max_data_files_size < + cf_options.compaction_options_fifo.max_table_files_size) { + return Status::InvalidArgument( + "max_data_files_size (total data = SST + blob) must be >= " + "max_table_files_size (SST only) when non-zero, since total data " + "always includes SST data."); + } + std::vector supported{0, 1, 2, 4, 8}; if (std::find(supported.begin(), supported.end(), cf_options.memtable_protection_bytes_per_key) == @@ -1570,6 +1668,8 @@ Status ColumnFamilyData::SetOptions( Status s = GetColumnFamilyOptionsFromMap(config_opts, cf_opts, options_map, &cf_opts); if (s.ok()) { + // FIXME: we should call SanitizeOptions() too or consolidate it with + // ValidateOptions(). s = ValidateOptions(db_opts, cf_opts); } if (s.ok()) { @@ -1680,7 +1780,8 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname, dummy_cfd_(new ColumnFamilyData( ColumnFamilyData::kDummyColumnFamilyDataId, "", nullptr, nullptr, nullptr, ColumnFamilyOptions(), *db_options, &file_options_, nullptr, - block_cache_tracer, io_tracer, db_id, db_session_id)), + block_cache_tracer, io_tracer, db_id, db_session_id, + /*read_only*/ true)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), @@ -1752,12 +1853,12 @@ size_t ColumnFamilySet::NumberOfColumnFamilies() const { // under a DB mutex AND write thread ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( const std::string& name, uint32_t id, Version* dummy_versions, - const ColumnFamilyOptions& options) { + const ColumnFamilyOptions& options, bool read_only) { assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = new ColumnFamilyData( id, name, dummy_versions, table_cache_, write_buffer_manager_, options, *db_options_, &file_options_, this, block_cache_tracer_, io_tracer_, - db_id_, db_session_id_); + db_id_, db_session_id_, read_only); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); auto ucmp = new_cfd->user_comparator(); diff --git a/db/column_family.h b/db/column_family.h index 51ad803b9002..60b3f15fa6c0 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -281,8 +281,9 @@ Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options); Status CheckCFPathsSupported(const DBOptions& db_options, const ColumnFamilyOptions& cf_options); -ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, - const ColumnFamilyOptions& src); +ColumnFamilyOptions SanitizeCfOptions(const ImmutableDBOptions& db_options, + bool read_only, + const ColumnFamilyOptions& src); // Wrap user defined table properties collector factories `from cf_options` // into internal ones in internal_tbl_prop_coll_factories. Add a system internal // one too. @@ -384,14 +385,17 @@ class ColumnFamilyData { Version* dummy_versions() { return dummy_versions_; } Version* current() { return current_; } // REQUIRE: DB mutex held void SetCurrent(Version* _current); // REQUIRE: DB mutex held - uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held - uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held - uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held - uint64_t GetTotalBlobFileSize() const; // REQUIRE: DB mutex held + uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held + uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held + uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held + uint64_t GetTotalBlobFileSize() const; // REQUIRE: DB mutex held // REQUIRE: DB mutex held void SetMemtable(MemTable* new_mem) { AssignMemtableID(new_mem); mem_ = new_mem; + if (ioptions_.disallow_memtable_writes) { + mem_->MarkImmutable(); + } } void AssignMemtableID(ReadOnlyMemTable* new_imm) { @@ -420,7 +424,8 @@ class ColumnFamilyData { const MutableCFOptions& mutable_options, const MutableDBOptions& mutable_db_options, const std::vector& existing_snapshots, - const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer); + const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer, + bool require_max_output_level = false); // Check if the passed range overlap with any running compactions. // REQUIRES: DB mutex held @@ -487,15 +492,11 @@ class ColumnFamilyData { uint64_t GetSuperVersionNumberRelaxed() const { return super_version_number_.load(std::memory_order_relaxed); } - // will return a pointer to SuperVersion* if previous SuperVersion - // if its reference count is zero and needs deletion or nullptr if not - // As argument takes a pointer to allocated SuperVersion to enable - // the clients to allocate SuperVersion outside of mutex. - // IMPORTANT: Only call this from DBImpl::InstallSuperVersion() - void InstallSuperVersion(SuperVersionContext* sv_context, - const MutableCFOptions& mutable_cf_options); + // Only intended for use by DBImpl::InstallSuperVersion() and variants void InstallSuperVersion(SuperVersionContext* sv_context, - InstrumentedMutex* db_mutex); + InstrumentedMutex* db_mutex, + std::optional> + new_seqno_to_time_mapping = {}); void ResetThreadLocalSuperVersions(); @@ -537,6 +538,12 @@ class ColumnFamilyData { assert(!ts_low.empty()); const Comparator* ucmp = user_comparator(); assert(ucmp); + // Guard against resurrected full_history_ts_low persisted in MANIFEST + // from previous DB sessions. This could happen if UDT was enabled and then + // disabled. + if (ucmp->timestamp_size() == 0) { + return; + } if (full_history_ts_low_.empty() || ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) { full_history_ts_low_ = std::move(ts_low); @@ -544,6 +551,11 @@ class ColumnFamilyData { } const std::string& GetFullHistoryTsLow() const { + const Comparator* ucmp = user_comparator(); + assert(ucmp); + if (ucmp->timestamp_size() == 0) { + assert(full_history_ts_low_.empty()); + } return full_history_ts_low_; } @@ -588,18 +600,21 @@ class ColumnFamilyData { return (mem_->IsEmpty() ? 0 : 1) + imm_.NumNotFlushed(); } + // thread-safe, DB mutex not needed. + bool AllowIngestBehind() const { + return ioptions_.cf_allow_ingest_behind || ioptions_.allow_ingest_behind; + } + private: friend class ColumnFamilySet; - ColumnFamilyData(uint32_t id, const std::string& name, - Version* dummy_versions, Cache* table_cache, - WriteBufferManager* write_buffer_manager, - const ColumnFamilyOptions& options, - const ImmutableDBOptions& db_options, - const FileOptions* file_options, - ColumnFamilySet* column_family_set, - BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer, - const std::string& db_id, const std::string& db_session_id); + ColumnFamilyData( + uint32_t id, const std::string& name, Version* dummy_versions, + Cache* table_cache, WriteBufferManager* write_buffer_manager, + const ColumnFamilyOptions& options, const ImmutableDBOptions& db_options, + const FileOptions* file_options, ColumnFamilySet* column_family_set, + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, const std::string& db_id, + const std::string& db_session_id, bool read_only); std::vector GetDbPaths() const; @@ -761,7 +776,8 @@ class ColumnFamilySet { ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id, Version* dummy_version, - const ColumnFamilyOptions& options); + const ColumnFamilyOptions& options, + bool read_only); const UnorderedMap& GetRunningColumnFamiliesTimestampSize() const { diff --git a/db/column_family_test.cc b/db/column_family_test.cc index 29ff2d15adbf..7cb505179c38 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -72,7 +72,6 @@ class ColumnFamilyTestBase : public testing::Test { env_->skip_fsync_ = true; dbname_ = test::PerThreadDBPath("column_family_test"); db_options_.create_if_missing = true; - db_options_.fail_if_options_file_error = true; db_options_.env = env_; } @@ -119,8 +118,7 @@ class ColumnFamilyTestBase : public testing::Test { for (int i = 0; i < n; i++) { if (flush_every != 0 && i != 0 && i % flush_every == 0) { - DBImpl* dbi = static_cast_with_check(db_); - dbi->TEST_FlushMemTable(); + dbfull()->TEST_FlushMemTable(); } int keyi = base + i; @@ -178,8 +176,7 @@ class ColumnFamilyTestBase : public testing::Test { } handles_.clear(); names_.clear(); - delete db_; - db_ = nullptr; + db_.reset(); } Status TryOpen(std::vector cf, @@ -219,7 +216,7 @@ class ColumnFamilyTestBase : public testing::Test { void Open() { Open({"default"}); } - DBImpl* dbfull() { return static_cast_with_check(db_); } + DBImpl* dbfull() { return static_cast_with_check(db_.get()); } int GetProperty(int cf, std::string property) { std::string value; @@ -271,7 +268,8 @@ class ColumnFamilyTestBase : public testing::Test { // them. ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( ConfigOptions(), desc.options, - SanitizeOptions(dbfull()->immutable_db_options(), current_cf_opt))); + SanitizeCfOptions(dbfull()->immutable_db_options(), + /*read_only*/ false, current_cf_opt))); cfi++; } } @@ -500,7 +498,7 @@ class ColumnFamilyTestBase : public testing::Test { ColumnFamilyOptions column_family_options_; DBOptions db_options_; std::string dbname_; - DB* db_ = nullptr; + std::unique_ptr db_; EnvCounter* env_; std::shared_ptr env_guard_; Random rnd_; @@ -517,7 +515,7 @@ class ColumnFamilyTest INSTANTIATE_TEST_CASE_P(FormatDef, ColumnFamilyTest, testing::Values(test::kDefaultFormatVersion)); INSTANTIATE_TEST_CASE_P(FormatLatest, ColumnFamilyTest, - testing::Values(kLatestFormatVersion)); + testing::Values(kLatestBbtFormatVersion)); TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) { for (int iter = 0; iter < 3; ++iter) { @@ -707,8 +705,8 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(test::kDefaultFormatVersion, false))); INSTANTIATE_TEST_CASE_P( FormatLatest, FlushEmptyCFTestWithParam, - testing::Values(std::make_tuple(kLatestFormatVersion, true), - std::make_tuple(kLatestFormatVersion, false))); + testing::Values(std::make_tuple(kLatestBbtFormatVersion, true), + std::make_tuple(kLatestBbtFormatVersion, false))); TEST_P(ColumnFamilyTest, AddDrop) { Open(); @@ -2175,7 +2173,7 @@ TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) { ASSERT_TRUE(has_cf2_sst); ASSERT_OK(Flush(0)); - ASSERT_EQ(0, dbfull()->TEST_total_log_size()); + ASSERT_EQ(0, dbfull()->TEST_wals_total_size()); Close(); } @@ -2232,7 +2230,7 @@ TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) { ASSERT_EQ(my_fs->options_files_created.load(), 2); } -TEST_P(ColumnFamilyTest, SanitizeOptions) { +TEST_P(ColumnFamilyTest, SanitizeCfOptions) { DBOptions db_options; for (int s = kCompactionStyleLevel; s <= kCompactionStyleUniversal; ++s) { for (int l = 0; l <= 2; l++) { @@ -2248,8 +2246,8 @@ TEST_P(ColumnFamilyTest, SanitizeOptions) { original.write_buffer_size = l * 4 * 1024 * 1024 + i * 1024 * 1024 + j * 1024 + k; - ColumnFamilyOptions result = - SanitizeOptions(ImmutableDBOptions(db_options), original); + ColumnFamilyOptions result = SanitizeCfOptions( + ImmutableDBOptions(db_options), /*read_only*/ false, original); ASSERT_TRUE(result.level0_stop_writes_trigger >= result.level0_slowdown_writes_trigger); ASSERT_TRUE(result.level0_slowdown_writes_trigger >= @@ -3542,11 +3540,10 @@ TEST_P(ColumnFamilyTest, MultipleCFPathsTest) { // Re-open and verify the keys. Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2}); - DBImpl* dbi = static_cast_with_check(db_); for (int cf = 1; cf != 3; ++cf) { ReadOptions read_options; read_options.readahead_size = 0; - auto it = dbi->NewIterator(read_options, handles_[cf]); + auto it = db_->NewIterator(read_options, handles_[cf]); for (it->SeekToFirst(); it->Valid(); it->Next()) { ASSERT_OK(it->status()); Slice key(it->key()); @@ -3636,7 +3633,7 @@ TEST(ColumnFamilyTest, ValidateMemtableKVChecksumOption) { // the behavior of manual flush is that it skips retaining UDTs. class ColumnFamilyRetainUDTTest : public ColumnFamilyTestBase { public: - ColumnFamilyRetainUDTTest() : ColumnFamilyTestBase(kLatestFormatVersion) {} + ColumnFamilyRetainUDTTest() : ColumnFamilyTestBase(kLatestBbtFormatVersion) {} void SetUp() override { db_options_.allow_concurrent_memtable_write = false; @@ -3886,7 +3883,7 @@ TEST_F(ManualFlushSkipRetainUDTTest, FlushRemovesStaleEntries) { static_cast_with_check(cfh)->cfd(); for (int version = 0; version < 100; version++) { if (version == 50) { - ASSERT_OK(static_cast_with_check(db_)->TEST_SwitchMemtable(cfd)); + ASSERT_OK(dbfull()->TEST_SwitchMemtable(cfd)); } ASSERT_OK( Put(0, "foo", EncodeAsUint64(version), "v" + std::to_string(version))); diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index d037f53accb9..62669bc1bdb2 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -75,10 +75,9 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) { options.level0_file_num_compaction_trigger = kLevel0Trigger; options.compression = kNoCompression; - DB* db = nullptr; + std::unique_ptr db; ASSERT_OK(DestroyDB(db_name_, options)); - Status s = DB::Open(options, db_name_, &db); - assert(s.ok()); + ASSERT_OK(DB::Open(options, db_name_, &db)); assert(db); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ @@ -114,7 +113,6 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) { } } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - delete db; } TEST_F(CompactFilesTest, MultipleLevel) { @@ -128,11 +126,11 @@ TEST_F(CompactFilesTest, MultipleLevel) { FlushedFileCollector* collector = new FlushedFileCollector(); options.listeners.emplace_back(collector); - DB* db = nullptr; + std::unique_ptr db; ASSERT_OK(DestroyDB(db_name_, options)); Status s = DB::Open(options, db_name_, &db); ASSERT_OK(s); - ASSERT_NE(db, nullptr); + ASSERT_NE(db.get(), nullptr); // create couple files in L0, L3, L4 and L5 for (int i = 5; i > 2; --i) { @@ -141,7 +139,8 @@ TEST_F(CompactFilesTest, MultipleLevel) { ASSERT_OK(db->Flush(FlushOptions())); // Ensure background work is fully finished including listener callbacks // before accessing listener state. - ASSERT_OK(static_cast_with_check(db)->TEST_WaitForBackgroundWork()); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForBackgroundWork()); auto l0_files = collector->GetFlushedFiles(); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, i)); @@ -191,8 +190,6 @@ TEST_F(CompactFilesTest, MultipleLevel) { ASSERT_OK(db->CompactFiles(CompactionOptions(), files, 5)); SyncPoint::GetInstance()->DisableProcessing(); thread.join(); - - delete db; } TEST_F(CompactFilesTest, ObsoleteFiles) { @@ -212,11 +209,11 @@ TEST_F(CompactFilesTest, ObsoleteFiles) { FlushedFileCollector* collector = new FlushedFileCollector(); options.listeners.emplace_back(collector); - DB* db = nullptr; + std::unique_ptr db; ASSERT_OK(DestroyDB(db_name_, options)); Status s = DB::Open(options, db_name_, &db); ASSERT_OK(s); - ASSERT_NE(db, nullptr); + ASSERT_NE(db.get(), nullptr); // create couple files for (int i = 1000; i < 2000; ++i) { @@ -226,13 +223,12 @@ TEST_F(CompactFilesTest, ObsoleteFiles) { auto l0_files = collector->GetFlushedFiles(); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); - ASSERT_OK(static_cast_with_check(db)->TEST_WaitForCompact()); + ASSERT_OK(static_cast_with_check(db.get())->TEST_WaitForCompact()); // verify all compaction input files are deleted for (const auto& fname : l0_files) { ASSERT_EQ(Status::NotFound(), env_->FileExists(fname)); } - delete db; } TEST_F(CompactFilesTest, NotCutOutputOnLevel0) { @@ -251,10 +247,9 @@ TEST_F(CompactFilesTest, NotCutOutputOnLevel0) { FlushedFileCollector* collector = new FlushedFileCollector(); options.listeners.emplace_back(collector); - DB* db = nullptr; + std::unique_ptr db; ASSERT_OK(DestroyDB(db_name_, options)); - Status s = DB::Open(options, db_name_, &db); - assert(s.ok()); + ASSERT_OK(DB::Open(options, db_name_, &db)); assert(db); // create couple files @@ -262,19 +257,20 @@ TEST_F(CompactFilesTest, NotCutOutputOnLevel0) { ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), std::string(1000, 'a' + (i % 26)))); } - ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); auto l0_files_1 = collector->GetFlushedFiles(); collector->ClearFlushedFiles(); for (int i = 0; i < 500; ++i) { ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), std::string(1000, 'a' + (i % 26)))); } - ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); auto l0_files_2 = collector->GetFlushedFiles(); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_1, 0)); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_2, 0)); // no assertion failure - delete db; } TEST_F(CompactFilesTest, CapturingPendingFiles) { @@ -289,7 +285,7 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) { FlushedFileCollector* collector = new FlushedFileCollector(); options.listeners.emplace_back(collector); - DB* db = nullptr; + std::unique_ptr db; ASSERT_OK(DestroyDB(db_name_, options)); Status s = DB::Open(options, db_name_, &db); ASSERT_OK(s); @@ -303,7 +299,8 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) { // Ensure background work is fully finished including listener callbacks // before accessing listener state. - ASSERT_OK(static_cast_with_check(db)->TEST_WaitForBackgroundWork()); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForBackgroundWork()); auto l0_files = collector->GetFlushedFiles(); EXPECT_EQ(5, l0_files.size()); @@ -327,13 +324,12 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - delete db; + db.reset(); // Make sure we can reopen the DB. s = DB::Open(options, db_name_, &db); ASSERT_OK(s); assert(db); - delete db; } TEST_F(CompactFilesTest, CompactionFilterWithGetSv) { @@ -365,12 +361,12 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) { options.create_if_missing = true; options.compaction_filter = cf.get(); - DB* db = nullptr; + std::unique_ptr db; ASSERT_OK(DestroyDB(db_name_, options)); Status s = DB::Open(options, db_name_, &db); ASSERT_OK(s); - cf->SetDB(db); + cf->SetDB(db.get()); // Write one L0 file ASSERT_OK(db->Put(WriteOptions(), "K1", "V1")); @@ -384,8 +380,6 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) { ASSERT_OK( db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), {fname}, 0)); } - - delete db; } TEST_F(CompactFilesTest, SentinelCompressionType) { @@ -413,7 +407,7 @@ TEST_F(CompactFilesTest, SentinelCompressionType) { options.create_if_missing = true; FlushedFileCollector* collector = new FlushedFileCollector(); options.listeners.emplace_back(collector); - DB* db = nullptr; + std::unique_ptr db; ASSERT_OK(DB::Open(options, db_name_, &db)); ASSERT_OK(db->Put(WriteOptions(), "key", "val")); @@ -421,7 +415,8 @@ TEST_F(CompactFilesTest, SentinelCompressionType) { // Ensure background work is fully finished including listener callbacks // before accessing listener state. - ASSERT_OK(static_cast_with_check(db)->TEST_WaitForBackgroundWork()); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForBackgroundWork()); auto l0_files = collector->GetFlushedFiles(); ASSERT_EQ(1, l0_files.size()); @@ -433,14 +428,18 @@ TEST_F(CompactFilesTest, SentinelCompressionType) { ROCKSDB_NAMESPACE::TablePropertiesCollection all_tables_props; ASSERT_OK(db->GetPropertiesOfAllTables(&all_tables_props)); for (const auto& name_and_table_props : all_tables_props) { - ASSERT_EQ(CompressionTypeToString(CompressionType::kZlibCompression), - name_and_table_props.second->compression_name); + // As of format_version 7, more elaborate information is encoded into the + // compression_name property + ASSERT_EQ("BuiltinV2;02;", name_and_table_props.second->compression_name); } - delete db; } } TEST_F(CompactFilesTest, CompressionWithBlockAlign) { + if (!Snappy_Supported()) { + ROCKSDB_GTEST_SKIP("Test requires Snappy support"); + return; + } Options options; options.compression = CompressionType::kNoCompression; options.create_if_missing = true; @@ -457,11 +456,7 @@ TEST_F(CompactFilesTest, CompressionWithBlockAlign) { } std::unique_ptr db; - { - DB* _db = nullptr; - ASSERT_OK(DB::Open(options, db_name_, &_db)); - db.reset(_db); - } + ASSERT_OK(DB::Open(options, db_name_, &db)); ASSERT_OK(db->Put(WriteOptions(), "key", "val")); ASSERT_OK(db->Flush(FlushOptions())); @@ -500,7 +495,7 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) { FlushedFileCollector* collector = new FlushedFileCollector(); options.listeners.emplace_back(collector); - DB* db = nullptr; + std::unique_ptr db; ASSERT_OK(DestroyDB(db_name_, options)); Status s = DB::Open(options, db_name_, &db); ASSERT_OK(s); @@ -511,7 +506,8 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) { ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), std::string(1000, 'a' + (i % 26)))); } - ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); auto l0_files_1 = collector->GetFlushedFiles(); CompactionOptions co; co.compression = CompressionType::kLZ4Compression; @@ -527,7 +523,228 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) { ASSERT_EQ(compaction_job_info.output_level, 0); ASSERT_OK(compaction_job_info.status); // no assertion failure - delete db; +} + +// Helper function to generate zero-padded keys +// e.g., MakeKey("a", 5) -> "a05", MakeKey("b", 42) -> "b42" +static std::string MakeKey(const std::string& prefix, int index) { + return prefix + (index < 10 ? "0" : "") + std::to_string(index); +} + +TEST_F(CompactFilesTest, TrivialMoveNonOverlappingFiles) { + Options options; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.compression = kNoCompression; + options.level_compaction_dynamic_level_bytes = false; + + std::unique_ptr db; + ASSERT_OK(DestroyDB(db_name_, options)); + Status s = DB::Open(options, db_name_, &db); + ASSERT_OK(s); + ASSERT_NE(db.get(), nullptr); + + // Create 3 non-overlapping files in L0 + // File 1: keys [a00-a99] + for (int i = 0; i < 100; i++) { + std::string key = MakeKey("a", i); + ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key)); + } + ASSERT_OK(db->Flush(FlushOptions())); + + // File 2: keys [b00-b99] + for (int i = 0; i < 100; i++) { + std::string key = MakeKey("b", i); + ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key)); + } + ASSERT_OK(db->Flush(FlushOptions())); + + // File 3: keys [c00-c99] + for (int i = 0; i < 100; i++) { + std::string key = MakeKey("c", i); + ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key)); + } + ASSERT_OK(db->Flush(FlushOptions())); + + // Verify files are in L0 + ColumnFamilyMetaData meta; + db->GetColumnFamilyMetaData(&meta); + ASSERT_EQ(meta.levels[0].files.size(), 3); + ASSERT_EQ(meta.levels[1].files.size(), 0); + + // Get L0 files + std::vector l0_files; + for (const auto& file : meta.levels[0].files) { + l0_files.push_back(file.db_path + "/" + file.name); + } + + CompactionOptions compact_option; + compact_option.allow_trivial_move = true; + // Compact all L0 files to L1 (non-overlapping in L1) + ASSERT_OK(db->CompactFiles(compact_option, l0_files, 1)); + + // Verify files are now in L1 + db->GetColumnFamilyMetaData(&meta); + ASSERT_EQ(meta.levels[0].files.size(), 0); + ASSERT_EQ(meta.levels[1].files.size(), 3); + + // Get the first file from L1 (should be the one with keys a00-a99) + std::string l1_file_to_move; + std::vector l1_files_to_move_later; + uint64_t l1_file_number = 0; + for (const auto& file : meta.levels[1].files) { + if (file.smallestkey[0] == 'a') { + l1_file_to_move = file.db_path + "/" + file.name; + l1_file_number = file.file_number; + } else { + l1_files_to_move_later.push_back(file.db_path + "/" + file.name); + } + } + ASSERT_FALSE(l1_file_to_move.empty()); + + // Set up sync point to verify trivial move path is taken + bool trivial_move_executed = false; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::CompactFilesImpl:TrivialMove", + [&](void* /*arg*/) { trivial_move_executed = true; }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Move the file from L1 to L6 - this should be a trivial move + // because the file doesn't overlap with anything in L6 + std::vector files_to_move = {l1_file_to_move}; + ASSERT_OK(db->CompactFiles(compact_option, files_to_move, 6)); + + // Verify trivial move was executed + ASSERT_TRUE(trivial_move_executed); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Verify the file is now in L6 + db->GetColumnFamilyMetaData(&meta); + ASSERT_EQ(meta.levels[1].files.size(), 2); // Two files remain in L1 + ASSERT_EQ(meta.levels[6].files.size(), 1); // One file in L6 + + // Verify it's the correct file in L6 + bool found_file_in_l6 = false; + for (const auto& file : meta.levels[6].files) { + if (file.file_number == l1_file_number) { + found_file_in_l6 = true; + // Verify key range hasn't changed + ASSERT_EQ(file.smallestkey[0], 'a'); + ASSERT_EQ(file.largestkey[0], 'a'); + break; + } + } + ASSERT_TRUE(found_file_in_l6); + + // Move the other 2 files from L1 to L6, with allow_trivial_move set to false. + // This will trigger a normal compaction, so the 2 files will be compacted + // into a single file in L6. + ASSERT_OK(db->CompactFiles(CompactionOptions(), l1_files_to_move_later, 6)); + + // Verify files in L6 + db->GetColumnFamilyMetaData(&meta); + ASSERT_EQ(meta.levels[1].files.size(), 0); // Zero files remain in L1 + ASSERT_EQ(meta.levels[6].files.size(), 2); // Two file in L6 + + // Verify data integrity - all keys should still be readable + for (int i = 0; i < 100; i++) { + std::string key = MakeKey("a", i); + std::string value; + ASSERT_OK(db->Get(ReadOptions(), key, &value)); + ASSERT_EQ(value, "value_" + key); + } +} + +TEST_F(CompactFilesTest, TrivialMoveBlockedByOverlap) { + Options options; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.compression = kNoCompression; + options.level_compaction_dynamic_level_bytes = false; + options.num_levels = 7; + + std::unique_ptr db; + ASSERT_OK(DestroyDB(db_name_, options)); + Status s = DB::Open(options, db_name_, &db); + ASSERT_OK(s); + ASSERT_NE(db.get(), nullptr); + + // Create a file in L6 with keys [m00-m99] (wide range) + for (int i = 0; i < 100; i++) { + std::string key = MakeKey("m", i); + ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key)); + } + ASSERT_OK(db->Flush(FlushOptions())); + + // Get L0 file + ColumnFamilyMetaData meta; + db->GetColumnFamilyMetaData(&meta); + std::vector l0_files; + for (const auto& file : meta.levels[0].files) { + l0_files.push_back(file.db_path + "/" + file.name); + } + + CompactionOptions compact_option; + compact_option.allow_trivial_move = true; + + // Move to L6 + ASSERT_OK(db->CompactFiles(compact_option, l0_files, 6)); + + // Now create a file in L1 with overlapping keys [m50-m60] + for (int i = 50; i <= 60; i++) { + std::string key = "m" + std::to_string(i); + ASSERT_OK(db->Put(WriteOptions(), key, "updated_value_" + key)); + } + ASSERT_OK(db->Flush(FlushOptions())); + + // Get the L0 file + db->GetColumnFamilyMetaData(&meta); + std::vector l0_files_2; + for (const auto& file : meta.levels[0].files) { + l0_files_2.push_back(file.db_path + "/" + file.name); + } + + // Move to L1 + ASSERT_OK(db->CompactFiles(compact_option, l0_files_2, 1)); + + // Get the L1 file + db->GetColumnFamilyMetaData(&meta); + ASSERT_EQ(meta.levels[1].files.size(), 1); + std::string l1_file = + meta.levels[1].files[0].db_path + "/" + meta.levels[1].files[0].name; + + // Set up sync point to verify full compaction path is taken + bool trivial_move_executed = false; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::CompactFilesImpl:TrivialMove", + [&](void* /*arg*/) { trivial_move_executed = true; }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Try to move from L1 to L6 - this should NOT be a trivial move + // because the file overlaps with the existing file in L6 + ASSERT_OK(db->CompactFiles(compact_option, {l1_file}, 6)); + + // Verify trivial move was NOT executed (full compaction happened) + ASSERT_FALSE(trivial_move_executed); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Verify the result - should have merged data in L6 + db->GetColumnFamilyMetaData(&meta); + ASSERT_EQ(meta.levels[1].files.size(), 0); // L1 should be empty + // L6 should have the merged file (may be 1 file if merged, or 2 if not) + ASSERT_GE(meta.levels[6].files.size(), 1); + + // Verify updated values are present + for (int i = 50; i <= 60; i++) { + std::string key = "m" + std::to_string(i); + std::string value; + ASSERT_OK(db->Get(ReadOptions(), key, &value)); + ASSERT_EQ(value, "updated_value_" + key); + } } } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index 313e2998aecd..9609f17c80f0 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -281,12 +281,13 @@ Compaction::Compaction( std::vector _inputs, int _output_level, uint64_t _target_file_size, uint64_t _max_compaction_bytes, uint32_t _output_path_id, CompressionType _compression, - CompressionOptions _compression_opts, Temperature _output_temperature, - uint32_t _max_subcompactions, std::vector _grandparents, + CompressionOptions _compression_opts, + Temperature _output_temperature_override, uint32_t _max_subcompactions, + std::vector _grandparents, std::optional _earliest_snapshot, - const SnapshotChecker* _snapshot_checker, bool _manual_compaction, - const std::string& _trim_ts, double _score, bool _deletion_compaction, - bool l0_files_might_overlap, CompactionReason _compaction_reason, + const SnapshotChecker* _snapshot_checker, + CompactionReason _compaction_reason, const std::string& _trim_ts, + double _score, bool l0_files_might_overlap, BlobGarbageCollectionPolicy _blob_garbage_collection_policy, double _blob_garbage_collection_age_cutoff) : input_vstorage_(vstorage), @@ -303,8 +304,10 @@ Compaction::Compaction( output_path_id_(_output_path_id), output_compression_(_compression), output_compression_opts_(_compression_opts), - output_temperature_(_output_temperature), - deletion_compaction_(_deletion_compaction), + output_temperature_override_(_output_temperature_override), + deletion_compaction_(_compaction_reason == CompactionReason::kFIFOTtl || + _compaction_reason == + CompactionReason::kFIFOMaxSize), l0_files_might_overlap_(l0_files_might_overlap), inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))), grandparents_(std::move(_grandparents)), @@ -321,7 +324,8 @@ Compaction::Compaction( ? false : IsBottommostLevel(output_level_, vstorage, inputs_)), is_full_compaction_(IsFullCompaction(vstorage, inputs_)), - is_manual_compaction_(_manual_compaction), + is_manual_compaction_(_compaction_reason == + CompactionReason::kManualCompaction), trim_ts_(_trim_ts), is_trivial_move_(false), compaction_reason_(_compaction_reason), @@ -338,20 +342,17 @@ Compaction::Compaction( _blob_garbage_collection_age_cutoff > 1 ? mutable_cf_options().blob_garbage_collection_age_cutoff : _blob_garbage_collection_age_cutoff), - penultimate_level_( - // For simplicity, we don't support the concept of "penultimate level" + proximal_level_( + // For simplicity, we don't support the concept of "proximal level" // with `CompactionReason::kExternalSstIngestion` and // `CompactionReason::kRefitLevel` _compaction_reason == CompactionReason::kExternalSstIngestion || _compaction_reason == CompactionReason::kRefitLevel ? Compaction::kInvalidLevel - : EvaluatePenultimateLevel(vstorage, mutable_cf_options_, - immutable_options_, start_level_, - output_level_)) { + : EvaluateProximalLevel(vstorage, mutable_cf_options_, + immutable_options_, start_level_, + output_level_)) { MarkFilesBeingCompacted(true); - if (is_manual_compaction_) { - compaction_reason_ = CompactionReason::kManualCompaction; - } if (max_subcompactions_ == 0) { max_subcompactions_ = _mutable_db_options.max_subcompactions; } @@ -405,10 +406,10 @@ Compaction::Compaction( } } - PopulatePenultimateLevelOutputRange(); + PopulateProximalLevelOutputRange(); } -void Compaction::PopulatePenultimateLevelOutputRange() { +void Compaction::PopulateProximalLevelOutputRange() { if (!SupportsPerKeyPlacement()) { assert(keep_in_last_level_through_seqno_ == kMaxSequenceNumber); return; @@ -417,46 +418,42 @@ void Compaction::PopulatePenultimateLevelOutputRange() { // exclude the last level, the range of all input levels is the safe range // of keys that can be moved up. int exclude_level = number_levels_ - 1; - penultimate_output_range_type_ = PenultimateOutputRangeType::kNonLastRange; + proximal_output_range_type_ = ProximalOutputRangeType::kNonLastRange; - // For universal compaction, the penultimate_output_range could be extended if - // all penultimate level files are included in the compaction (which includes - // the case that the penultimate level is empty). + // For universal compaction, the proximal_output_range could be extended if + // all proximal level files are included in the compaction (which includes + // the case that the proximal level is empty). if (immutable_options_.compaction_style == kCompactionStyleUniversal) { exclude_level = kInvalidLevel; - penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange; - std::set penultimate_inputs; + proximal_output_range_type_ = ProximalOutputRangeType::kFullRange; + std::set proximal_inputs; for (const auto& input_lvl : inputs_) { - if (input_lvl.level == penultimate_level_) { + if (input_lvl.level == proximal_level_) { for (const auto& file : input_lvl.files) { - penultimate_inputs.emplace(file->fd.GetNumber()); + proximal_inputs.emplace(file->fd.GetNumber()); } } } - auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_); - for (const auto& file : penultimate_files) { - if (penultimate_inputs.find(file->fd.GetNumber()) == - penultimate_inputs.end()) { + auto proximal_files = input_vstorage_->LevelFiles(proximal_level_); + for (const auto& file : proximal_files) { + if (proximal_inputs.find(file->fd.GetNumber()) == proximal_inputs.end()) { exclude_level = number_levels_ - 1; - penultimate_output_range_type_ = - PenultimateOutputRangeType::kNonLastRange; + proximal_output_range_type_ = ProximalOutputRangeType::kNonLastRange; break; } } } - // FIXME: should make use of `penultimate_output_range_type_`. + // FIXME: should make use of `proximal_output_range_type_`. // FIXME: when last level's input range does not overlap with - // penultimate level, and penultimate level input is empty, - // this call will not set penultimate_level_smallest_ or - // penultimate_level_largest_. No keys will be compacted up. - GetBoundaryInternalKeys(input_vstorage_, inputs_, - &penultimate_level_smallest_, - &penultimate_level_largest_, exclude_level); - - if (penultimate_output_range_type_ != - PenultimateOutputRangeType::kFullRange) { - // If not full range in penultimate level, must keep everything already + // proximal level, and proximal level input is empty, + // this call will not set proximal_level_smallest_ or + // proximal_level_largest_. No keys will be compacted up. + GetBoundaryInternalKeys(input_vstorage_, inputs_, &proximal_level_smallest_, + &proximal_level_largest_, exclude_level); + + if (proximal_output_range_type_ != ProximalOutputRangeType::kFullRange) { + // If not full range in proximal level, must keep everything already // in the last level there, because moving it back up might cause // overlap/placement issues that are difficult to resolve properly in the // presence of range deletes @@ -486,23 +483,23 @@ Compaction::~Compaction() { } bool Compaction::SupportsPerKeyPlacement() const { - return penultimate_level_ != kInvalidLevel; + return proximal_level_ != kInvalidLevel; } -int Compaction::GetPenultimateLevel() const { return penultimate_level_; } +int Compaction::GetProximalLevel() const { return proximal_level_; } // smallest_key and largest_key include timestamps if user-defined timestamp is // enabled. -bool Compaction::OverlapPenultimateLevelOutputRange( +bool Compaction::OverlapProximalLevelOutputRange( const Slice& smallest_key, const Slice& largest_key) const { if (!SupportsPerKeyPlacement()) { return false; } - // See FIXME in Compaction::PopulatePenultimateLevelOutputRange(). + // See FIXME in Compaction::PopulateProximalLevelOutputRange(). // We do not compact any key up in this case. - if (penultimate_level_smallest_.size() == 0 || - penultimate_level_largest_.size() == 0) { + if (proximal_level_smallest_.size() == 0 || + proximal_level_largest_.size() == 0) { return false; } @@ -510,13 +507,13 @@ bool Compaction::OverlapPenultimateLevelOutputRange( input_vstorage_->InternalComparator()->user_comparator(); return ucmp->CompareWithoutTimestamp( - smallest_key, penultimate_level_largest_.user_key()) <= 0 && + smallest_key, proximal_level_largest_.user_key()) <= 0 && ucmp->CompareWithoutTimestamp( - largest_key, penultimate_level_smallest_.user_key()) >= 0; + largest_key, proximal_level_smallest_.user_key()) >= 0; } // key includes timestamp if user-defined timestamp is enabled. -void Compaction::TEST_AssertWithinPenultimateLevelOutputRange( +void Compaction::TEST_AssertWithinProximalLevelOutputRange( const Slice& user_key, bool expect_failure) const { #ifdef NDEBUG (void)user_key; @@ -524,15 +521,15 @@ void Compaction::TEST_AssertWithinPenultimateLevelOutputRange( #else assert(SupportsPerKeyPlacement()); - assert(penultimate_level_smallest_.size() > 0); - assert(penultimate_level_largest_.size() > 0); + assert(proximal_level_smallest_.size() > 0); + assert(proximal_level_largest_.size() > 0); auto* cmp = input_vstorage_->user_comparator(); // op_type of a key can change during compaction, e.g. Merge -> Put. - if (!(cmp->Compare(user_key, penultimate_level_smallest_.user_key()) >= 0)) { + if (!(cmp->Compare(user_key, proximal_level_smallest_.user_key()) >= 0)) { assert(expect_failure); - } else if (!(cmp->Compare(user_key, penultimate_level_largest_.user_key()) <= + } else if (!(cmp->Compare(user_key, proximal_level_largest_.user_key()) <= 0)) { assert(expect_failure); } else { @@ -651,6 +648,8 @@ bool Compaction::KeyNotExistsBeyondOutputLevel( return true; } else if (output_level_ != 0 && cfd_->ioptions().compaction_style == kCompactionStyleLevel) { + // TODO: apply the optimization here to other compaction styles and + // compaction/flush to L0. // Maybe use binary search to find right entry instead of linear search? const Comparator* user_cmp = cfd_->user_comparator(); for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) { @@ -1018,7 +1017,7 @@ uint64_t Compaction::MinInputFileEpochNumber() const { return min_epoch_number; } -int Compaction::EvaluatePenultimateLevel( +int Compaction::EvaluateProximalLevel( const VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, const ImmutableOptions& immutable_options, const int start_level, @@ -1033,21 +1032,21 @@ int Compaction::EvaluatePenultimateLevel( return kInvalidLevel; } - int penultimate_level = output_level - 1; - assert(penultimate_level < immutable_options.num_levels); - if (penultimate_level <= 0) { + int proximal_level = output_level - 1; + assert(proximal_level < immutable_options.num_levels); + if (proximal_level <= 0) { return kInvalidLevel; } - // If the penultimate level is not within input level -> output level range - // check if the penultimate output level is empty, if it's empty, it could - // also be locked for the penultimate output. + // If the proximal level is not within input level -> output level range + // check if the proximal output level is empty, if it's empty, it could + // also be locked for the proximal output. // TODO: ideally, it only needs to check if there's a file within the // compaction output key range. For simplicity, it just check if there's any - // file on the penultimate level. + // file on the proximal level. if (start_level == immutable_options.num_levels - 1 && (immutable_options.compaction_style != kCompactionStyleUniversal || - !vstorage->LevelFiles(penultimate_level).empty())) { + !vstorage->LevelFiles(proximal_level).empty())) { return kInvalidLevel; } @@ -1061,7 +1060,7 @@ int Compaction::EvaluatePenultimateLevel( return kInvalidLevel; } - return penultimate_level; + return proximal_level; } void Compaction::FilterInputsForCompactionIterator() { @@ -1130,4 +1129,17 @@ void Compaction::FilterInputsForCompactionIterator() { } } +Temperature Compaction::GetOutputTemperature(bool is_proximal_level) const { + if (output_temperature_override_ != Temperature::kUnknown) { + return output_temperature_override_; + } + + if (is_last_level() && !is_proximal_level && + mutable_cf_options_.last_level_temperature != Temperature::kUnknown) { + return mutable_cf_options_.last_level_temperature; + } + + return mutable_cf_options_.default_write_temperature; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h index 534b13c6a8f8..44eb876ac71a 100644 --- a/db/compaction/compaction.h +++ b/db/compaction/compaction.h @@ -90,25 +90,25 @@ class Compaction { uint64_t target_file_size, uint64_t max_compaction_bytes, uint32_t output_path_id, CompressionType compression, CompressionOptions compression_opts, - Temperature output_temperature, uint32_t max_subcompactions, + Temperature output_temperature_override, + uint32_t max_subcompactions, std::vector grandparents, std::optional earliest_snapshot, const SnapshotChecker* snapshot_checker, - bool manual_compaction = false, const std::string& trim_ts = "", - double score = -1, bool deletion_compaction = false, + CompactionReason compaction_reason, + const std::string& trim_ts = "", double score = -1, bool l0_files_might_overlap = true, - CompactionReason compaction_reason = CompactionReason::kUnknown, BlobGarbageCollectionPolicy blob_garbage_collection_policy = BlobGarbageCollectionPolicy::kUseDefault, double blob_garbage_collection_age_cutoff = -1); - // The type of the penultimate level output range - enum class PenultimateOutputRangeType : int { - kNotSupported, // it cannot output to the penultimate level - kFullRange, // any data could be output to the penultimate level + // The type of the proximal level output range + enum class ProximalOutputRangeType : int { + kNotSupported, // it cannot output to the proximal level + kFullRange, // any data could be output to the proximal level kNonLastRange, // only the keys within non_last_level compaction inputs can - // be outputted to the penultimate level - kDisabled, // no data can be outputted to the penultimate level + // be outputted to the proximal level + kDisabled, // no data can be outputted to the proximal level }; // No copying allowed @@ -180,6 +180,10 @@ class Compaction { const std::vector* inputs() { return &inputs_; } // Returns the LevelFilesBrief of the specified compaction input level. + // Note that if the compaction includes standalone range deletion file, + // this function returns the result after filtering out input files covered + // by the range deletion file. + // Use inputs() if you want to get the original input files. const LevelFilesBrief* input_levels(size_t compaction_input_level) const { return &input_levels_[compaction_input_level]; } @@ -283,6 +287,13 @@ class Compaction { // are non-overlapping and can be trivially moved. bool is_trivial_move() const { return is_trivial_move_; } + bool is_trivial_copy_compaction() const { + return immutable_options_.compaction_style == kCompactionStyleFIFO && + compaction_reason_ == CompactionReason::kChangeTemperature && + mutable_cf_options_.compaction_options_fifo + .allow_trivial_copy_when_change_temperature; + } + // How many total levels are there? int number_levels() const { return number_levels_; } @@ -370,29 +381,29 @@ class Compaction { Slice GetLargestUserKey() const { return largest_user_key_; } - PenultimateOutputRangeType GetPenultimateOutputRangeType() const { - return penultimate_output_range_type_; + ProximalOutputRangeType GetProximalOutputRangeType() const { + return proximal_output_range_type_; } // Return true if the compaction supports per_key_placement bool SupportsPerKeyPlacement() const; - // Get per_key_placement penultimate output level, which is `last_level - 1` + // Get per_key_placement proximal output level, which is `last_level - 1` // if per_key_placement feature is supported. Otherwise, return -1. - int GetPenultimateLevel() const; + int GetProximalLevel() const; - // Return true if the given range is overlap with penultimate level output + // Return true if the given range is overlap with proximal level output // range. // Both smallest_key and largest_key include timestamps if user-defined // timestamp is enabled. - bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key, - const Slice& largest_key) const; + bool OverlapProximalLevelOutputRange(const Slice& smallest_key, + const Slice& largest_key) const; - // For testing purposes, check that a key is within penultimate level + // For testing purposes, check that a key is within proximal level // output range for per_key_placement feature, which is safe to place the key - // to the penultimate level. Different compaction strategies have different + // to the proximal level. Different compaction strategies have different // rules. `user_key` includes timestamp if user-defined timestamp is enabled. - void TEST_AssertWithinPenultimateLevelOutputRange( + void TEST_AssertWithinProximalLevelOutputRange( const Slice& user_key, bool expect_failure = false) const; CompactionReason compaction_reason() const { return compaction_reason_; } @@ -403,7 +414,11 @@ class Compaction { uint64_t max_compaction_bytes() const { return max_compaction_bytes_; } - Temperature output_temperature() const { return output_temperature_; } + // Order of precedence for temperature: + // 1. Override temp if not kUnknown + // 2. Temperature of the last level files if applicable + // 3. Default write temperature + Temperature GetOutputTemperature(bool is_proximal_level = false) const; uint32_t max_subcompactions() const { return max_subcompactions_; } @@ -441,20 +456,25 @@ class Compaction { static constexpr int kInvalidLevel = -1; - // Evaluate penultimate output level. If the compaction supports - // per_key_placement feature, it returns the penultimate level number. + // Evaluate proximal output level. If the compaction supports + // per_key_placement feature, it returns the proximal level number. // Otherwise, it's set to kInvalidLevel (-1), which means - // output_to_penultimate_level is not supported. - // Note: even the penultimate level output is supported (PenultimateLevel != + // output_to_proximal_level is not supported. + // Note: even the proximal level output is supported (ProximalLevel != // kInvalidLevel), some key range maybe unsafe to be outputted to the - // penultimate level. The safe key range is populated by - // `PopulatePenultimateLevelOutputRange()`. - // Which could potentially disable all penultimate level output. - static int EvaluatePenultimateLevel( - const VersionStorageInfo* vstorage, - const MutableCFOptions& mutable_cf_options, - const ImmutableOptions& immutable_options, const int start_level, - const int output_level); + // proximal level. The safe key range is populated by + // `PopulateProximalLevelOutputRange()`. + // Which could potentially disable all proximal level output. + static int EvaluateProximalLevel(const VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options, + const ImmutableOptions& immutable_options, + const int start_level, + const int output_level); + + static bool OutputToNonZeroMaxOutputLevel(int output_level, + int max_output_level) { + return output_level > 0 && output_level == max_output_level; + } // If some data cannot be safely migrated "up" the LSM tree due to a change // in the preclude_last_level_data_seconds setting, this indicates a sequence @@ -482,10 +502,10 @@ class Compaction { InternalKey* smallest_key, InternalKey* largest_key, int exclude_level = -1); - // populate penultimate level output range, which will be used to determine if - // a key is safe to output to the penultimate level (details see - // `Compaction::WithinPenultimateLevelOutputRange()`. - void PopulatePenultimateLevelOutputRange(); + // populate proximal level output range, which will be used to determine if + // a key is safe to output to the proximal level (details see + // `Compaction::WithinProximalLevelOutputRange()`. + void PopulateProximalLevelOutputRange(); // If oldest snapshot is specified at Compaction construction time, we have // an opportunity to optimize inputs for compaction iterator for this case: @@ -530,7 +550,7 @@ class Compaction { const uint32_t output_path_id_; CompressionType output_compression_; CompressionOptions output_compression_opts_; - Temperature output_temperature_; + Temperature output_temperature_override_; // If true, then the compaction can be done by simply deleting input files. const bool deletion_compaction_; // should it split the output file using the compact cursor? @@ -616,20 +636,20 @@ class Compaction { // only set when per_key_placement feature is enabled, -1 (kInvalidLevel) // means not supported. - const int penultimate_level_; + const int proximal_level_; - // Key range for penultimate level output + // Key range for proximal level output // includes timestamp if user-defined timestamp is enabled. - // penultimate_output_range_type_ shows the range type - InternalKey penultimate_level_smallest_; - InternalKey penultimate_level_largest_; - PenultimateOutputRangeType penultimate_output_range_type_ = - PenultimateOutputRangeType::kNotSupported; + // proximal_output_range_type_ shows the range type + InternalKey proximal_level_smallest_; + InternalKey proximal_level_largest_; + ProximalOutputRangeType proximal_output_range_type_ = + ProximalOutputRangeType::kNotSupported; }; #ifndef NDEBUG // Helper struct only for tests, which contains the data to decide if a key -// should be output to the penultimate level. +// should be output to the proximal level. // TODO: remove this when the public feature knob is available struct PerKeyPlacementContext { const int level; @@ -637,16 +657,16 @@ struct PerKeyPlacementContext { const Slice value; const SequenceNumber seq_num; - bool& output_to_penultimate_level; + bool& output_to_proximal_level; PerKeyPlacementContext(int _level, Slice _key, Slice _value, SequenceNumber _seq_num, - bool& _output_to_penultimate_level) + bool& _output_to_proximal_level) : level(_level), key(_key), value(_value), seq_num(_seq_num), - output_to_penultimate_level(_output_to_penultimate_level) {} + output_to_proximal_level(_output_to_proximal_level) {} }; #endif /* !NDEBUG */ diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index dc441817c6cc..e76490225c26 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -28,7 +28,7 @@ CompactionIterator::CompactionIterator( SequenceNumber earliest_snapshot, SequenceNumber earliest_write_conflict_snapshot, SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker, - Env* env, bool report_detailed_time, bool expect_valid_internal_key, + Env* env, bool report_detailed_time, CompactionRangeDelAggregator* range_del_agg, BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, bool enforce_single_del_contracts, @@ -42,8 +42,8 @@ CompactionIterator::CompactionIterator( : CompactionIterator( input, cmp, merge_helper, last_sequence, snapshots, earliest_snapshot, earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env, - report_detailed_time, expect_valid_internal_key, range_del_agg, - blob_file_builder, allow_data_in_errors, enforce_single_del_contracts, + report_detailed_time, range_del_agg, blob_file_builder, + allow_data_in_errors, enforce_single_del_contracts, manual_compaction_canceled, compaction ? std::make_unique(compaction) : nullptr, must_count_input_entries, compaction_filter, shutting_down, info_log, @@ -55,7 +55,7 @@ CompactionIterator::CompactionIterator( SequenceNumber earliest_snapshot, SequenceNumber earliest_write_conflict_snapshot, SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker, - Env* env, bool report_detailed_time, bool expect_valid_internal_key, + Env* env, bool report_detailed_time, CompactionRangeDelAggregator* range_del_agg, BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, bool enforce_single_del_contracts, @@ -76,16 +76,14 @@ CompactionIterator::CompactionIterator( env_(env), clock_(env_->GetSystemClock().get()), report_detailed_time_(report_detailed_time), - expect_valid_internal_key_(expect_valid_internal_key), range_del_agg_(range_del_agg), blob_file_builder_(blob_file_builder), compaction_(std::move(compaction)), compaction_filter_(compaction_filter), shutting_down_(shutting_down), manual_compaction_canceled_(manual_compaction_canceled), - bottommost_level_(!compaction_ ? false - : compaction_->bottommost_level() && - !compaction_->allow_ingest_behind()), + bottommost_level_(compaction_ && compaction_->bottommost_level() && + !compaction_->allow_ingest_behind()), // snapshots_ cannot be nullptr, but we will assert later in the body of // the constructor. visible_at_tip_(snapshots_ ? snapshots_->empty() : false), @@ -161,6 +159,7 @@ void CompactionIterator::Next() { // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to be valid. if (!s.ok()) { + // FIXME: should fail compaction after this fatal logging. ROCKS_LOG_FATAL( info_log_, "Invalid ikey %s in compaction. %s", allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden", @@ -464,18 +463,9 @@ void CompactionIterator::NextFromInput() { if (!pik_status.ok()) { iter_stats_.num_input_corrupt_records++; - // If `expect_valid_internal_key_` is false, return the corrupted key - // and let the caller decide what to do with it. - if (expect_valid_internal_key_) { - status_ = pik_status; - return; - } - key_ = current_key_.SetInternalKey(key_); - has_current_user_key_ = false; - current_user_key_sequence_ = kMaxSequenceNumber; - current_user_key_snapshot_ = 0; - validity_info_.SetValid(ValidContext::kParseKeyError); - break; + // Always fail compaction when encountering corrupted internal keys + status_ = pik_status; + return; } TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_); if (is_range_del_) { @@ -642,7 +632,8 @@ void CompactionIterator::NextFromInput() { } else if (ikey_.type == kTypeSingleDeletion) { // We can compact out a SingleDelete if: // 1) We encounter the corresponding PUT -OR- we know that this key - // doesn't appear past this output level + // doesn't appear past this output level and we are not in + // ingest_behind mode. // =AND= // 2) We've already returned a record in this snapshot -OR- // there are no earlier earliest_write_conflict_snapshot. @@ -731,6 +722,8 @@ void CompactionIterator::NextFromInput() { "CompactionIterator::NextFromInput:SingleDelete:1", const_cast(c)); if (last_key_seq_zeroed_) { + // Drop SD and the next key since they are both in the last + // snapshot (since last key has seqno zeroed). ++iter_stats_.num_record_drop_hidden; ++iter_stats_.num_record_drop_obsolete; assert(bottommost_level_); @@ -841,7 +834,7 @@ void CompactionIterator::NextFromInput() { // iteration. If the next key is corrupt, we return before the // comparison, so the value of has_current_user_key does not matter. has_current_user_key_ = false; - if (compaction_ != nullptr && + if (compaction_ != nullptr && !compaction_->allow_ingest_behind() && DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, &level_ptrs_) && @@ -854,6 +847,9 @@ void CompactionIterator::NextFromInput() { ++iter_stats_.num_optimized_del_drop_obsolete; } } else if (last_key_seq_zeroed_) { + // Sequence number zeroing requires bottommost_level_, which is + // false with ingest_behind. + assert(!compaction_->allow_ingest_behind()); // Skip. ++iter_stats_.num_record_drop_hidden; ++iter_stats_.num_record_drop_obsolete; @@ -870,6 +866,7 @@ void CompactionIterator::NextFromInput() { } else if (last_sequence != kMaxSequenceNumber && (last_snapshot == current_user_key_snapshot_ || last_snapshot < current_user_key_snapshot_)) { + // rule (A): // If the earliest snapshot is which this key is visible in // is the same as the visibility of a previous instance of the // same key, then this kv is not visible in any snapshot. @@ -878,6 +875,15 @@ void CompactionIterator::NextFromInput() { // Note: Dropping this key will not affect TransactionDB write-conflict // checking since there has already been a record returned for this key // in this snapshot. + // When ingest_behind is enabled, it's ok that we drop an overwritten + // Delete here. The overwritting key still covers whatever that will be + // ingested. Note that we will not drop SingleDelete here as SingleDelte + // is handled entirely in its own if clause. This is important, see + // example: from new to old: SingleDelete_1, PUT_1, SingleDelete_2, PUT_2, + // where all operations are on the same key and PUT_2 is ingested with + // ingest_behind=true. If SingleDelete_2 is dropped due to being compacted + // together with PUT_1, and then PUT_1 is compacted away together with + // SingleDelete_1, PUT_2 can incorrectly becomes visible. if (last_sequence < current_user_key_sequence_) { ROCKS_LOG_FATAL(info_log_, "key %s, last_sequence (%" PRIu64 @@ -887,12 +893,13 @@ void CompactionIterator::NextFromInput() { assert(false); } - ++iter_stats_.num_record_drop_hidden; // rule (A) + ++iter_stats_.num_record_drop_hidden; AdvanceInputIter(); } else if (compaction_ != nullptr && (ikey_.type == kTypeDeletion || (ikey_.type == kTypeDeletionWithTimestamp && cmp_with_history_ts_low_ < 0)) && + !compaction_->allow_ingest_behind() && DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, &level_ptrs_)) { @@ -928,11 +935,13 @@ void CompactionIterator::NextFromInput() { (ikey_.type == kTypeDeletionWithTimestamp && cmp_with_history_ts_low_ < 0)) && bottommost_level_) { + assert(compaction_); + assert(!compaction_->allow_ingest_behind()); // bottommost_level_ is true // Handle the case where we have a delete key at the bottom most level // We can skip outputting the key iff there are no subsequent puts for // this key - assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel( - ikey_.user_key, &level_ptrs_)); + assert(compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, + &level_ptrs_)); ParsedInternalKey next_ikey; AdvanceInputIter(); #ifndef NDEBUG @@ -974,6 +983,12 @@ void CompactionIterator::NextFromInput() { (compaction_ != nullptr && compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, &level_ptrs_)))) { + // FIXME: it's possible that we are setting sequence number to 0 as + // preferred sequence number here. If cf_ingest_behind is enabled, this + // may fail ingestions since they expect all keys above the last level + // to have non-zero sequence number. We should probably not allow seqno + // zeroing here. + // // This section that attempts to swap preferred sequence number will not // be invoked if this is a CompactionIterator created for flush, since // `compaction_` will be nullptr and it's not bottommost either. @@ -1105,17 +1120,15 @@ void CompactionIterator::NextFromInput() { } } - if (!Valid() && IsShuttingDown()) { - status_ = Status::ShutdownInProgress(); - } - - if (IsPausingManualCompaction()) { - status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused); - } - - // Propagate corruption status from memtable itereator - if (!input_.Valid() && input_.status().IsCorruption()) { - status_ = input_.status(); + if (status_.ok()) { + if (!Valid() && IsShuttingDown()) { + status_ = Status::ShutdownInProgress(); + } else if (IsPausingManualCompaction()) { + status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } else if (!input_.Valid() && input_.status().IsCorruption()) { + // Propagate corruption status from memtable iterator + status_ = input_.status(); + } } } @@ -1274,11 +1287,11 @@ void CompactionIterator::PrepareOutput() { // // Can we do the same for levels above bottom level as long as // KeyNotExistsBeyondOutputLevel() return true? - if (Valid() && compaction_ != nullptr && - !compaction_->allow_ingest_behind() && bottommost_level_ && + if (Valid() && bottommost_level_ && DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && ikey_.type != kTypeMerge && current_key_committed_ && ikey_.sequence <= preserve_seqno_after_ && !is_range_del_) { + assert(compaction_ != nullptr && !compaction_->allow_ingest_behind()); if (ikey_.type == kTypeDeletion || (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) { ROCKS_LOG_FATAL( @@ -1297,14 +1310,14 @@ void CompactionIterator::PrepareOutput() { validity_info_.rep); assert(false); } - ikey_.sequence = 0; - last_key_seq_zeroed_ = true; - TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput:ZeroingSeq", - &ikey_); + + bool zeroed_seqno = false; if (!timestamp_size_) { current_key_.UpdateInternalKey(0, ikey_.type); + zeroed_seqno = true; } else if (full_history_ts_low_ && cmp_with_history_ts_low_ < 0) { - // We can also zero out timestamp for better compression. + // For UDT, the seqno and timestamp could only be zeroed out after the + // key is below history_ts_low_. // For the same user key (excluding timestamp), the timestamp-based // history can be collapsed to save some space if the timestamp is // older than *full_history_ts_low_. @@ -1312,6 +1325,14 @@ void CompactionIterator::PrepareOutput() { const Slice ts_slice = kTsMin; ikey_.SetTimestamp(ts_slice); current_key_.UpdateInternalKey(0, ikey_.type, &ts_slice); + zeroed_seqno = true; + } + + if (zeroed_seqno) { + ikey_.sequence = 0; + last_key_seq_zeroed_ = true; + TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput:ZeroingSeq", + &ikey_); } } } diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h index c3e4942ac342..a851e35f93d5 100644 --- a/db/compaction/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -145,7 +145,8 @@ class CompactionIterator { } bool allow_ingest_behind() const override { - return compaction_->immutable_options().allow_ingest_behind; + return compaction_->immutable_options().cf_allow_ingest_behind || + compaction_->immutable_options().allow_ingest_behind; } bool allow_mmap_reads() const override { @@ -182,17 +183,27 @@ class CompactionIterator { const Compaction* compaction_; }; - // @param must_count_input_entries if true, `NumInputEntryScanned()` will - // return the number of input keys scanned. If false, `NumInputEntryScanned()` - // will return this number if no Seek was called on `input`. User should call - // `HasNumInputEntryScanned()` first in this case. + // @param must_count_input_entries Controls input entry counting accuracy vs + // performance: + // - If true: `NumInputEntryScanned()` always returns the exact count of + // input keys + // scanned. The iterator will use sequential `Next()` calls instead of + // `Seek()` to maintain count accuracy as `Seek()` will not count the + // skipped input entries, which is slower but guarantees correctness. + // - If false: `NumInputEntryScanned()` returns the count only if no + // `Seek()` operations + // were performed on the input iterator. When compaction filters request + // skipping ranges of keys or other optimizations trigger seek operations, + // the count becomes unreliable. Always call `HasNumInputEntryScanned()` + // first to verify if the count is accurate before using + // `NumInputEntryScanned()`. CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, SequenceNumber last_sequence, std::vector* snapshots, SequenceNumber earliest_snapshot, SequenceNumber earliest_write_conflict_snapshot, SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker, - Env* env, bool report_detailed_time, bool expect_valid_internal_key, + Env* env, bool report_detailed_time, CompactionRangeDelAggregator* range_del_agg, BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, bool enforce_single_del_contracts, @@ -212,7 +223,7 @@ class CompactionIterator { SequenceNumber earliest_write_conflict_snapshot, SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker, Env* env, - bool report_detailed_time, bool expect_valid_internal_key, + bool report_detailed_time, CompactionRangeDelAggregator* range_del_agg, BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, @@ -254,7 +265,21 @@ class CompactionIterator { } const CompactionIterationStats& iter_stats() const { return iter_stats_; } bool HasNumInputEntryScanned() const { return input_.HasNumItered(); } + + // This method should only be used when `HasNumInputEntryScanned()` returns + // true, unless `must_count_input_entries=true` was specified during iterator + // creation (which ensures the count is always accurate). uint64_t NumInputEntryScanned() const { return input_.NumItered(); } + + // Returns true if the current valid key was already scanned/counted during + // a lookahead operation in a previous iteration. + // + // REQUIRED: Valid() must be true + bool IsCurrentKeyAlreadyScanned() const { + assert(Valid()); + return at_next_ || merge_out_iter_.Valid(); + } + Status InputStatus() const { return input_.status(); } bool IsDeleteRangeSentinelKey() const { return is_range_del_; } @@ -347,7 +372,6 @@ class CompactionIterator { Env* env_; SystemClock* clock_; const bool report_detailed_time_; - const bool expect_valid_internal_key_; CompactionRangeDelAggregator* range_del_agg_; BlobFileBuilder* blob_file_builder_; std::unique_ptr compaction_; @@ -417,13 +441,15 @@ class CompactionIterator { // NextFromInput()). ParsedInternalKey ikey_; - // Stores whether ikey_.user_key is valid. If set to false, the user key is - // not compared against the current key in the underlying iterator. + // Stores whether current_user_key_ is valid. If so, current_user_key_ + // stores the user key of the last key seen by the iterator. + // If false, treat the next key to read as a new user key. bool has_current_user_key_ = false; // If false, the iterator holds a copy of the current compaction iterator // output (or current key in the underlying iterator during NextFromInput()). bool at_next_ = false; + // A copy of the current internal key. IterKey current_key_; Slice current_user_key_; std::string curr_ts_; @@ -433,8 +459,9 @@ class CompactionIterator { // True if the iterator has already returned a record for the current key. bool has_outputted_key_ = false; - // truncated the value of the next key and output it without applying any - // compaction rules. This is used for outputting a put after a single delete. + // Truncate the value of the next key and output it without applying any + // compaction rules. This is an optimization for outputting a put after + // a single delete. See more in `NextFromInput()` under Optimization 3. bool clear_and_output_next_key_ = false; MergeOutputIterator merge_out_iter_; diff --git a/db/compaction/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc index 974a4e1ff837..5ede0f4e1623 100644 --- a/db/compaction/compaction_iterator_test.cc +++ b/db/compaction/compaction_iterator_test.cc @@ -294,7 +294,7 @@ class CompactionIteratorTest : public testing::TestWithParam { snapshots_.empty() ? kMaxSequenceNumber : snapshots_.at(0), earliest_write_conflict_snapshot, kMaxSequenceNumber, snapshot_checker_.get(), Env::Default(), - false /* report_detailed_time */, false, range_del_agg_.get(), + false /* report_detailed_time */, range_del_agg_.get(), nullptr /* blob_file_builder */, true /*allow_data_in_errors*/, true /*enforce_single_del_contracts*/, /*manual_compaction_canceled=*/kManualCompactionCanceledFalse_, @@ -374,8 +374,7 @@ TEST_P(CompactionIteratorTest, EmptyResult) { ASSERT_FALSE(c_iter_->Valid()); } -// If there is a corruption after a single deletion, the corrupted key should -// be preserved. +// If there is a corruption after a single deletion, the compaction should fail. TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) { InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion), test::KeyStr("a", 3, kTypeValue, true), @@ -386,14 +385,10 @@ TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) { ASSERT_EQ(test::KeyStr("a", 5, kTypeSingleDeletion), c_iter_->key().ToString()); c_iter_->Next(); - ASSERT_TRUE(c_iter_->Valid()); - ASSERT_EQ(test::KeyStr("a", 3, kTypeValue, true), c_iter_->key().ToString()); - c_iter_->Next(); - ASSERT_TRUE(c_iter_->Valid()); - ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString()); - c_iter_->Next(); - ASSERT_OK(c_iter_->status()); + // The iterator should now fail when encountering the corrupted key ASSERT_FALSE(c_iter_->Valid()); + ASSERT_FALSE(c_iter_->status().ok()); + ASSERT_TRUE(c_iter_->status().IsCorruption()); } // Tests compatibility of TimedPut and SingleDelete. TimedPut should act as if diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 0ea74891e40d..8092a26069be 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -51,7 +51,9 @@ #include "rocksdb/status.h" #include "rocksdb/table.h" #include "rocksdb/utilities/options_type.h" +#include "table/format.h" #include "table/merging_iterator.h" +#include "table/meta_blocks.h" #include "table/table_builder.h" #include "table/unique_id_impl.h" #include "test_util/sync_point.h" @@ -109,16 +111,16 @@ const char* GetCompactionReasonString(CompactionReason compaction_reason) { } } -const char* GetCompactionPenultimateOutputRangeTypeString( - Compaction::PenultimateOutputRangeType range_type) { +const char* GetCompactionProximalOutputRangeTypeString( + Compaction::ProximalOutputRangeType range_type) { switch (range_type) { - case Compaction::PenultimateOutputRangeType::kNotSupported: + case Compaction::ProximalOutputRangeType::kNotSupported: return "NotSupported"; - case Compaction::PenultimateOutputRangeType::kFullRange: + case Compaction::ProximalOutputRangeType::kFullRange: return "FullRange"; - case Compaction::PenultimateOutputRangeType::kNonLastRange: + case Compaction::ProximalOutputRangeType::kNonLastRange: return "NonLastRange"; - case Compaction::PenultimateOutputRangeType::kDisabled: + case Compaction::ProximalOutputRangeType::kDisabled: return "Disabled"; default: assert(false); @@ -126,6 +128,10 @@ const char* GetCompactionPenultimateOutputRangeTypeString( } } +// Static constant for compaction abort flag - always false, used for +// compaction service jobs that don't support abort signaling +const std::atomic CompactionJob::kCompactionAbortedFalse{0}; + CompactionJob::CompactionJob( int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, const MutableDBOptions& mutable_db_options, const FileOptions& file_options, @@ -133,21 +139,18 @@ CompactionJob::CompactionJob( LogBuffer* log_buffer, FSDirectory* db_directory, FSDirectory* output_directory, FSDirectory* blob_output_directory, Statistics* stats, InstrumentedMutex* db_mutex, - ErrorHandler* db_error_handler, - std::vector existing_snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, JobContext* job_context, + ErrorHandler* db_error_handler, JobContext* job_context, std::shared_ptr table_cache, EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, CompactionJobStats* compaction_job_stats, Env::Priority thread_pri, const std::shared_ptr& io_tracer, const std::atomic& manual_compaction_canceled, - const std::string& db_id, const std::string& db_session_id, - std::string full_history_ts_low, std::string trim_ts, - BlobFileCompletionCallback* blob_callback, int* bg_compaction_scheduled, - int* bg_bottom_compaction_scheduled) + const std::atomic& compaction_aborted, const std::string& db_id, + const std::string& db_session_id, std::string full_history_ts_low, + std::string trim_ts, BlobFileCompletionCallback* blob_callback, + int* bg_compaction_scheduled, int* bg_bottom_compaction_scheduled) : compact_(new CompactionState(compaction)), - compaction_stats_(compaction->compaction_reason(), 1), + internal_stats_(compaction->compaction_reason(), 1), db_options_(db_options), mutable_db_options_copy_(mutable_db_options), log_buffer_(log_buffer), @@ -155,7 +158,7 @@ CompactionJob::CompactionJob( stats_(stats), bottommost_level_(false), write_hint_(Env::WLTH_NOT_SET), - compaction_job_stats_(compaction_job_stats), + job_stats_(compaction_job_stats), job_id_(job_id), dbname_(dbname), db_id_(db_id), @@ -169,16 +172,16 @@ CompactionJob::CompactionJob( versions_(versions), shutting_down_(shutting_down), manual_compaction_canceled_(manual_compaction_canceled), + compaction_aborted_(compaction_aborted), db_directory_(db_directory), blob_output_directory_(blob_output_directory), db_mutex_(db_mutex), db_error_handler_(db_error_handler), - existing_snapshots_(std::move(existing_snapshots)), - earliest_snapshot_(existing_snapshots_.empty() - ? kMaxSequenceNumber - : existing_snapshots_.at(0)), - earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), - snapshot_checker_(snapshot_checker), + // job_context cannot be nullptr, but we will assert later in the body of + // the constructor. + earliest_snapshot_(job_context + ? job_context->GetEarliestSnapshotSequence() + : kMaxSequenceNumber), job_context_(job_context), table_cache_(std::move(table_cache)), event_logger_(event_logger), @@ -191,8 +194,10 @@ CompactionJob::CompactionJob( extra_num_subcompaction_threads_reserved_(0), bg_compaction_scheduled_(bg_compaction_scheduled), bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) { - assert(compaction_job_stats_ != nullptr); + assert(job_stats_ != nullptr); assert(log_buffer_ != nullptr); + assert(job_context); + assert(job_context->snapshot_context_initialized); const auto* cfd = compact_->compaction->column_family_data(); ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking); @@ -224,10 +229,9 @@ void CompactionJob::ReportStartedCompaction(Compaction* compaction) { ThreadStatus::COMPACTION_PROP_FLAGS, compaction->is_manual_compaction() + (compaction->deletion_compaction() << 1)); - + auto total_input_bytes = compaction->CalculateTotalInputSize(); ThreadStatusUtil::SetThreadOperationProperty( - ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES, - compaction->CalculateTotalInputSize()); + ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES, total_input_bytes); IOSTATS_RESET(bytes_written); IOSTATS_RESET(bytes_read); @@ -240,14 +244,25 @@ void CompactionJob::ReportStartedCompaction(Compaction* compaction) { // to ensure GetThreadList() can always show them all together. ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); - compaction_job_stats_->is_manual_compaction = - compaction->is_manual_compaction(); - compaction_job_stats_->is_full_compaction = compaction->is_full_compaction(); + job_stats_->is_manual_compaction = compaction->is_manual_compaction(); + job_stats_->is_full_compaction = compaction->is_full_compaction(); + // populate compaction stats num_input_files and total_num_of_bytes + size_t num_input_files = 0; + for (int input_level = 0; + input_level < static_cast(compaction->num_input_levels()); + ++input_level) { + const LevelFilesBrief* flevel = compaction->input_levels(input_level); + num_input_files += flevel->num_files; + } + job_stats_->CompactionJobStats::num_input_files = num_input_files; + job_stats_->total_input_bytes = total_input_bytes; } void CompactionJob::Prepare( std::optional, std::optional>> - known_single_subcompact) { + known_single_subcompact, + const CompactionProgress& compaction_progress, + log::Writer* compaction_progress_writer) { db_mutex_->AssertHeld(); AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_PREPARE); @@ -260,7 +275,8 @@ void CompactionJob::Prepare( assert(storage_info); assert(storage_info->NumLevelFiles(compact_->compaction->level()) > 0); - write_hint_ = storage_info->CalculateSSTWriteHint(c->output_level()); + write_hint_ = storage_info->CalculateSSTWriteHint( + c->output_level(), db_options_.calculate_sst_write_lifetime_hint_set); bottommost_level_ = c->bottommost_level(); if (!known_single_subcompact.has_value() && c->ShouldFormSubcompactions()) { @@ -296,13 +312,15 @@ void CompactionJob::Prepare( /*sub_job_id*/ 0); } + MaybeAssignCompactionProgressAndWriter(compaction_progress, + compaction_progress_writer); + // collect all seqno->time information from the input files which will be used // to encode seqno->time to the output files. SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber; SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber; uint64_t preserve_time_duration = - std::max(c->mutable_cf_options().preserve_internal_time_seconds, - c->mutable_cf_options().preclude_last_level_data_seconds); + MinAndMaxPreserveSeconds(c->mutable_cf_options()).max_preserve_seconds; if (preserve_time_duration > 0) { const ReadOptions read_options(Env::IOActivity::kCompaction); @@ -379,8 +397,8 @@ void CompactionJob::Prepare( } // Now combine what we would like to preclude from last level with what we // can safely support without dangerously moving data back up the LSM tree, - // to get the final seqno threshold for penultimate vs. last. In particular, - // when the reserved output key range for the penultimate level does not + // to get the final seqno threshold for proximal vs. last. In particular, + // when the reserved output key range for the proximal level does not // include the entire last level input key range, we need to keep entries // already in the last level there. (Even allowing within-range entries to // move back up could cause problems with range tombstones. Perhaps it @@ -389,12 +407,31 @@ void CompactionJob::Prepare( // tracking and complexity to CompactionIterator that is probably not // worthwhile overall. Correctness is also more clear when splitting by // seqno threshold.) - penultimate_after_seqno_ = std::max(preclude_last_level_min_seqno, - c->GetKeepInLastLevelThroughSeqno()); + proximal_after_seqno_ = std::max(preclude_last_level_min_seqno, + c->GetKeepInLastLevelThroughSeqno()); options_file_number_ = versions_->options_file_number(); } +void CompactionJob::MaybeAssignCompactionProgressAndWriter( + const CompactionProgress& compaction_progress, + log::Writer* compaction_progress_writer) { + // LIMITATION: Only supports resuming single subcompaction for now + if (compact_->sub_compact_states.size() != 1) { + return; + } + + if (!compaction_progress.empty()) { + assert(compaction_progress.size() == 1); + SubcompactionState* sub_compact = &compact_->sub_compact_states[0]; + const SubcompactionProgress& subcompaction_progress = + compaction_progress[0]; + sub_compact->SetSubcompactionProgress(subcompaction_progress); + } + + compaction_progress_writer_ = compaction_progress_writer; +} + uint64_t CompactionJob::GetSubcompactionsLimit() { return extra_num_subcompaction_threads_reserved_ + std::max( @@ -667,16 +704,18 @@ void CompactionJob::GenSubcompactionBoundaries() { extra_num_subcompaction_threads_reserved_)); } -Status CompactionJob::Run() { +void CompactionJob::InitializeCompactionRun() { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_RUN); TEST_SYNC_POINT("CompactionJob::Run():Start"); log_buffer_->FlushBufferToLog(); LogCompaction(); +} +void CompactionJob::RunSubcompactions() { + TEST_SYNC_POINT("CompactionJob::RunSubcompactions:BeforeStart"); const size_t num_threads = compact_->sub_compact_states.size(); assert(num_threads > 0); - const uint64_t start_micros = db_options_.clock->NowMicros(); compact_->compaction->GetOrInitInputTableProperties(); // Launch a thread for each of subcompactions 1...num_threads-1 @@ -695,25 +734,108 @@ Status CompactionJob::Run() { for (auto& thread : thread_pool) { thread.join(); } + RemoveEmptyOutputs(); + + ReleaseSubcompactionResources(); + TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources"); +} - compaction_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros); +void CompactionJob::UpdateTimingStats(uint64_t start_micros) { + internal_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros); for (auto& state : compact_->sub_compact_states) { - compaction_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros); - state.RemoveLastEmptyOutput(); + internal_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros); } RecordTimeToHistogram(stats_, COMPACTION_TIME, - compaction_stats_.stats.micros); + internal_stats_.output_level_stats.micros); RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, - compaction_stats_.stats.cpu_micros); + internal_stats_.output_level_stats.cpu_micros); +} - TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify"); +void CompactionJob::RemoveEmptyOutputs() { + for (auto& state : compact_->sub_compact_states) { + state.RemoveLastEmptyOutput(); + } +} + +void CompactionJob::CleanupAbortedSubcompactions() { + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); - // Check if any thread encountered an error during execution + uint64_t total_sst_files_deleted = 0; + uint64_t total_blob_files_deleted = 0; + + // Track the first file deletion error to report at the end + Status first_error; + int deletion_errors = 0; + + // Mark all subcompactions as aborted and delete their output files + for (auto& sub_compact : compact_->sub_compact_states) { + // Mark this subcompaction as aborted + sub_compact.status = + Status::Incomplete(Status::SubCode::kCompactionAborted); + + // Delete all files (SST and blob) tracked during compaction. + // GetOutputFilePaths() contains ALL file paths created, including + // in-progress files that may have been removed from outputs_ or + // blob_file_additions_. + for (const bool is_proximal_level : {false, true}) { + if (is_proximal_level && + !compact_->compaction->SupportsPerKeyPlacement()) { + continue; + } + for (const std::string& file_path : + sub_compact.Outputs(is_proximal_level)->GetOutputFilePaths()) { + Status s = env_->DeleteFile(file_path); + if (s.ok()) { + // Count SST vs blob files by checking extension + if (file_path.find(".sst") != std::string::npos) { + total_sst_files_deleted++; + } else if (file_path.find(".blob") != std::string::npos) { + total_blob_files_deleted++; + } + } else if (!s.IsNotFound()) { + if (first_error.ok()) { + first_error = s; + } + deletion_errors++; + } + } + } + sub_compact.CleanupOutputs(); + } + + if (stats_) { + RecordTick(stats_, COMPACTION_ABORTED); + } + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Compaction aborted: deleted %" PRIu64 + " SST files and %" PRIu64 " blob files", + cfd->GetName().c_str(), job_id_, total_sst_files_deleted, + total_blob_files_deleted); + + if (!first_error.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "[%s] [JOB %d] Cleanup completed with %d file deletion " + "errors. First error: %s", + cfd->GetName().c_str(), job_id_, deletion_errors, + first_error.ToString().c_str()); + } +} + +bool CompactionJob::HasNewBlobFiles() const { + for (const auto& state : compact_->sub_compact_states) { + if (state.Current().HasBlobFileAdditions()) { + return true; + } + } + return false; +} + +Status CompactionJob::CollectSubcompactionErrors() { Status status; IOStatus io_s; - bool wrote_new_blob_files = false; for (const auto& state : compact_->sub_compact_states) { if (!state.status.ok()) { @@ -721,127 +843,161 @@ Status CompactionJob::Run() { io_s = state.io_status; break; } - - if (state.Current().HasBlobFileAdditions()) { - wrote_new_blob_files = true; - } } if (io_status_.ok()) { io_status_ = io_s; } - if (status.ok()) { - constexpr IODebugContext* dbg = nullptr; - if (output_directory_) { - io_s = output_directory_->FsyncWithDirOptions( - IOOptions(), dbg, - DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); - } + return status; +} - if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ && - blob_output_directory_ != output_directory_) { - io_s = blob_output_directory_->FsyncWithDirOptions( - IOOptions(), dbg, - DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); - } +Status CompactionJob::SyncOutputDirectories() { + Status status; + IOStatus io_s; + constexpr IODebugContext* dbg = nullptr; + const bool wrote_new_blob_files = HasNewBlobFiles(); + if (output_directory_) { + io_s = output_directory_->FsyncWithDirOptions( + IOOptions(), dbg, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } + + if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ && + blob_output_directory_ != output_directory_) { + io_s = blob_output_directory_->FsyncWithDirOptions( + IOOptions(), dbg, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } + if (io_status_.ok()) { io_status_ = io_s; } if (status.ok()) { status = io_s; } - if (status.ok()) { - thread_pool.clear(); - std::vector files_output; - for (const auto& state : compact_->sub_compact_states) { - for (const auto& output : state.GetOutputs()) { - files_output.emplace_back(&output); - } - } - ColumnFamilyData* cfd = compact_->compaction->column_family_data(); - std::atomic next_file_idx(0); - auto verify_table = [&](Status& output_status) { - while (true) { - size_t file_idx = next_file_idx.fetch_add(1); - if (file_idx >= files_output.size()) { - break; - } - // Verify that the table is usable - // We set for_compaction to false and don't - // OptimizeForCompactionTableRead here because this is a special case - // after we finish the table building No matter whether - // use_direct_io_for_flush_and_compaction is true, we will regard this - // verification as user reads since the goal is to cache it here for - // further user reads - ReadOptions verify_table_read_options(Env::IOActivity::kCompaction); - verify_table_read_options.rate_limiter_priority = - GetRateLimiterPriority(); - InternalIterator* iter = cfd->table_cache()->NewIterator( - verify_table_read_options, file_options_, - cfd->internal_comparator(), files_output[file_idx]->meta, - /*range_del_agg=*/nullptr, - compact_->compaction->mutable_cf_options(), - /*table_reader_ptr=*/nullptr, - cfd->internal_stats()->GetFileReadHist( - compact_->compaction->output_level()), - TableReaderCaller::kCompactionRefill, /*arena=*/nullptr, - /*skip_filters=*/false, compact_->compaction->output_level(), - MaxFileSizeForL0MetaPin(compact_->compaction->mutable_cf_options()), - /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, - /*allow_unprepared_value=*/false); - auto s = iter->status(); - - if (s.ok() && paranoid_file_checks_) { - OutputValidator validator(cfd->internal_comparator(), - /*_enable_hash=*/true); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - s = validator.Add(iter->key(), iter->value()); - if (!s.ok()) { - break; - } - } - if (s.ok()) { - s = iter->status(); + + return status; +} + +Status CompactionJob::VerifyOutputFiles() { + Status status; + std::vector thread_pool; + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + VerifyOutputFlags verify_output_flags = + compact_->compaction->mutable_cf_options().verify_output_flags; + + // For backward compatibility + if (paranoid_file_checks_) { + verify_output_flags |= VerifyOutputFlags::kVerifyIteration; + verify_output_flags |= VerifyOutputFlags::kEnableForLocalCompaction; + verify_output_flags |= VerifyOutputFlags::kEnableForRemoteCompaction; + } + + auto verify_table = [&](SubcompactionState& subcompaction_state) { + for (const auto& output_file : subcompaction_state.GetOutputs()) { + // Verify that the table is usable + // We set for_compaction to false and don't + // OptimizeForCompactionTableRead here because this is a special case + // after we finish the table building No matter whether + // use_direct_io_for_flush_and_compaction is true, we will regard this + // verification as user reads since the goal is to cache it here for + // further user reads + ReadOptions verify_table_read_options(Env::IOActivity::kCompaction); + verify_table_read_options.verify_checksums = true; + verify_table_read_options.readahead_size = + file_options_for_read_.compaction_readahead_size; + + std::unique_ptr table_reader_guard; + TableReader* table_reader_ptr = table_reader_guard.get(); + verify_table_read_options.rate_limiter_priority = + GetRateLimiterPriority(); + InternalIterator* iter = cfd->table_cache()->NewIterator( + verify_table_read_options, file_options_, cfd->internal_comparator(), + output_file.meta, + /*range_del_agg=*/nullptr, compact_->compaction->mutable_cf_options(), + /*table_reader_ptr=*/&table_reader_ptr, + cfd->internal_stats()->GetFileReadHist( + compact_->compaction->output_level()), + TableReaderCaller::kCompactionRefill, /*arena=*/nullptr, + /*skip_filters=*/false, compact_->compaction->output_level(), + MaxFileSizeForL0MetaPin(compact_->compaction->mutable_cf_options()), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr, + /*allow_unprepared_value=*/false); + auto s = iter->status(); + if (s.ok()) { + // Check for remote/local compaction and verify_output_flags flags + const bool should_verify = + (subcompaction_state.compaction_job_stats.is_remote_compaction && + !!(verify_output_flags & + VerifyOutputFlags::kEnableForRemoteCompaction)) || + (!subcompaction_state.compaction_job_stats.is_remote_compaction && + !!(verify_output_flags & + VerifyOutputFlags::kEnableForLocalCompaction)); + + if (should_verify) { + const bool should_verify_block_checksum = + !!(verify_output_flags & VerifyOutputFlags::kVerifyBlockChecksum); + const bool should_verify_iteration = + !!(verify_output_flags & VerifyOutputFlags::kVerifyIteration); + if (should_verify_block_checksum) { + assert(table_reader_ptr != nullptr); + // If verifying iteration as well, verify meta blocks here only to + // avoid redundant checks on data blocks + s = table_reader_ptr->VerifyChecksum( + verify_table_read_options, TableReaderCaller::kCompaction, + /*meta_blocks_only=*/should_verify_iteration); } - if (s.ok() && - !validator.CompareValidator(files_output[file_idx]->validator)) { - s = Status::Corruption("Paranoid checksums do not match"); + if (s.ok() && should_verify_iteration) { + OutputValidator validator(cfd->internal_comparator(), + /*_enable_hash=*/true); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + s = validator.Add(iter->key(), iter->value()); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + s = iter->status(); + } + if (s.ok() && !validator.CompareValidator(output_file.validator)) { + s = Status::Corruption( + "Key-value checksum of compaction output doesn't match what " + "was computed when written"); + } } } + } - delete iter; + delete iter; - if (!s.ok()) { - output_status = s; - break; - } + if (!s.ok()) { + subcompaction_state.status = s; + break; } - }; - for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) { - thread_pool.emplace_back( - verify_table, std::ref(compact_->sub_compact_states[i].status)); - } - verify_table(compact_->sub_compact_states[0].status); - for (auto& thread : thread_pool) { - thread.join(); } + }; + for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) { + thread_pool.emplace_back(verify_table, + std::ref(compact_->sub_compact_states[i])); + } + verify_table(compact_->sub_compact_states[0]); + for (auto& thread : thread_pool) { + thread.join(); + } - for (const auto& state : compact_->sub_compact_states) { - if (!state.status.ok()) { - status = state.status; - break; - } + for (const auto& state : compact_->sub_compact_states) { + if (!state.status.ok()) { + status = state.status; + break; } } - ReleaseSubcompactionResources(); - TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:0"); - TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:1"); + return status; +} - TablePropertiesCollection tp; +void CompactionJob::SetOutputTableProperties() { for (const auto& state : compact_->sub_compact_states) { for (const auto& output : state.GetOutputs()) { auto fn = @@ -851,56 +1007,109 @@ Status CompactionJob::Run() { output.table_properties); } } +} +void CompactionJob::AggregateSubcompactionOutputAndJobStats() { // Before the compaction starts, is_remote_compaction was set to true if // compaction_service is set. We now know whether each sub_compaction was // done remotely or not. Reset is_remote_compaction back to false and allow // AggregateCompactionStats() to set the right value. - compaction_job_stats_->is_remote_compaction = false; + job_stats_->is_remote_compaction = false; // Finish up all bookkeeping to unify the subcompaction results. - compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_); - uint64_t num_input_range_del = 0; - bool ok = UpdateCompactionStats(&num_input_range_del); - // (Sub)compactions returned ok, do sanity check on the number of input keys. - if (status.ok() && ok && compaction_job_stats_->has_num_input_records) { - size_t ts_sz = compact_->compaction->column_family_data() - ->user_comparator() - ->timestamp_size(); - // When trim_ts_ is non-empty, CompactionIterator takes - // HistoryTrimmingIterator as input iterator and sees a trimmed view of - // input keys. So the number of keys it processed is not suitable for - // verification here. - // TODO: support verification when trim_ts_ is non-empty. - if (!(ts_sz > 0 && !trim_ts_.empty())) { - assert(compaction_stats_.stats.num_input_records > 0); - // TODO: verify the number of range deletion entries. - uint64_t expected = - compaction_stats_.stats.num_input_records - num_input_range_del; - uint64_t actual = compaction_job_stats_->num_input_records; - if (expected != actual) { - char scratch[2345]; - compact_->compaction->Summary(scratch, sizeof(scratch)); - std::string msg = - "Compaction number of input keys does not match " - "number of keys processed. Expected " + - std::to_string(expected) + " but processed " + - std::to_string(actual) + ". Compaction summary: " + scratch; - ROCKS_LOG_WARN( - db_options_.info_log, "[%s] [JOB %d] Compaction with status: %s", - compact_->compaction->column_family_data()->GetName().c_str(), - job_context_->job_id, msg.c_str()); - if (db_options_.compaction_verify_record_count) { - status = Status::Corruption(msg); - } - } + compact_->AggregateCompactionStats(internal_stats_, *job_stats_); +} + +Status CompactionJob::VerifyCompactionRecordCounts( + bool stats_built_from_input_table_prop, uint64_t num_input_range_del) { + Status status; + if (stats_built_from_input_table_prop && + job_stats_->has_accurate_num_input_records) { + status = VerifyInputRecordCount(num_input_range_del); + if (!status.ok()) { + return status; + } + } + + const auto& mutable_cf_options = compact_->compaction->mutable_cf_options(); + if ((mutable_cf_options.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName()) || + mutable_cf_options.table_factory->IsInstanceOf( + TableFactory::kPlainTableName()))) { + status = VerifyOutputRecordCount(); + if (!status.ok()) { + return status; } } + return status; +} + +void CompactionJob::FinalizeCompactionRun( + const Status& input_status, bool stats_built_from_input_table_prop, + uint64_t num_input_range_del) { + if (stats_built_from_input_table_prop) { + UpdateCompactionJobInputStatsFromInternalStats(internal_stats_, + num_input_range_del); + } + UpdateCompactionJobOutputStatsFromInternalStats(input_status, + internal_stats_); RecordCompactionIOStats(); + LogFlush(db_options_.info_log); TEST_SYNC_POINT("CompactionJob::Run():End"); - compact_->status = status; - TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():EndStatusSet", &status); + compact_->status = input_status; + TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():EndStatusSet", + const_cast(&input_status)); +} + +Status CompactionJob::Run() { + InitializeCompactionRun(); + + const uint64_t start_micros = db_options_.clock->NowMicros(); + + RunSubcompactions(); + + UpdateTimingStats(start_micros); + + TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify"); + + Status status = CollectSubcompactionErrors(); + + // If compaction was aborted or manually paused, clean up any output files + // from completed subcompactions to prevent orphaned files on disk. + // Skip cleanup for resumable compaction (when progress writer is set) + // because the output files are needed for resumption. + if ((status.IsCompactionAborted() || status.IsManualCompactionPaused()) && + compaction_progress_writer_ == nullptr) { + CleanupAbortedSubcompactions(); + } + + if (status.ok()) { + status = SyncOutputDirectories(); + } + + if (status.ok()) { + status = VerifyOutputFiles(); + } + + if (status.ok()) { + SetOutputTableProperties(); + } + + AggregateSubcompactionOutputAndJobStats(); + + uint64_t num_input_range_del = 0; + bool stats_built_from_input_table_prop = + UpdateInternalStatsFromInputFiles(&num_input_range_del); + + if (status.ok()) { + status = VerifyCompactionRecordCounts(stats_built_from_input_table_prop, + num_input_range_del); + } + + FinalizeCompactionRun(status, stats_built_from_input_table_prop, + num_input_range_del); + return status; } @@ -917,7 +1126,7 @@ Status CompactionJob::Install(bool* compaction_released) { int output_level = compact_->compaction->output_level(); cfd->internal_stats()->AddCompactionStats(output_level, thread_pri_, - compaction_stats_); + internal_stats_); if (status.ok()) { status = InstallCompactionResults(compaction_released); @@ -928,7 +1137,7 @@ Status CompactionJob::Install(bool* compaction_released) { VersionStorageInfo::LevelSummaryStorage tmp; auto vstorage = cfd->current()->storage_info(); - const auto& stats = compaction_stats_.stats; + const auto& stats = internal_stats_.output_level_stats; double read_write_amp = 0.0; double write_amp = 0.0; @@ -994,19 +1203,20 @@ Status CompactionJob::Install(bool* compaction_released) { blob_files.back()->GetBlobFileNumber()); } - if (compaction_stats_.has_penultimate_level_output) { - ROCKS_LOG_BUFFER( - log_buffer_, - "[%s] has Penultimate Level output: %" PRIu64 - ", level %d, number of files: %" PRIu64 ", number of records: %" PRIu64, - column_family_name.c_str(), - compaction_stats_.penultimate_level_stats.bytes_written, - compact_->compaction->GetPenultimateLevel(), - compaction_stats_.penultimate_level_stats.num_output_files, - compaction_stats_.penultimate_level_stats.num_output_records); + if (internal_stats_.has_proximal_level_output) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] has Proximal Level output: %" PRIu64 + ", level %d, number of files: %" PRIu64 + ", number of records: %" PRIu64, + column_family_name.c_str(), + internal_stats_.proximal_level_stats.bytes_written, + compact_->compaction->GetProximalLevel(), + internal_stats_.proximal_level_stats.num_output_files, + internal_stats_.proximal_level_stats.num_output_records); } - UpdateCompactionJobStats(stats); + TEST_SYNC_POINT_CALLBACK( + "CompactionJob::Install:AfterUpdateCompactionJobStats", job_stats_); auto stream = event_logger_->LogToBuffer(log_buffer_, 8192); stream << "job" << job_id_ << "event" << "compaction_finished" @@ -1028,17 +1238,16 @@ Status CompactionJob::Install(bool* compaction_released) { << CompressionTypeToString(compact_->compaction->output_compression()); stream << "num_single_delete_mismatches" - << compaction_job_stats_->num_single_del_mismatch; + << job_stats_->num_single_del_mismatch; stream << "num_single_delete_fallthrough" - << compaction_job_stats_->num_single_del_fallthru; + << job_stats_->num_single_del_fallthru; if (measure_io_stats_) { - stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos; - stream << "file_range_sync_nanos" - << compaction_job_stats_->file_range_sync_nanos; - stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos; + stream << "file_write_nanos" << job_stats_->file_write_nanos; + stream << "file_range_sync_nanos" << job_stats_->file_range_sync_nanos; + stream << "file_fsync_nanos" << job_stats_->file_fsync_nanos; stream << "file_prepare_write_nanos" - << compaction_job_stats_->file_prepare_write_nanos; + << job_stats_->file_prepare_write_nanos; } stream << "lsm_state"; @@ -1056,16 +1265,16 @@ Status CompactionJob::Install(bool* compaction_released) { stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber(); } - if (compaction_stats_.has_penultimate_level_output) { + if (internal_stats_.has_proximal_level_output) { InternalStats::CompactionStats& pl_stats = - compaction_stats_.penultimate_level_stats; - stream << "penultimate_level_num_output_files" << pl_stats.num_output_files; - stream << "penultimate_level_bytes_written" << pl_stats.bytes_written; - stream << "penultimate_level_num_output_records" + internal_stats_.proximal_level_stats; + stream << "proximal_level_num_output_files" << pl_stats.num_output_files; + stream << "proximal_level_bytes_written" << pl_stats.bytes_written; + stream << "proximal_level_num_output_records" << pl_stats.num_output_records; - stream << "penultimate_level_num_output_files_blob" + stream << "proximal_level_num_output_files_blob" << pl_stats.num_output_files_blob; - stream << "penultimate_level_bytes_written_blob" + stream << "proximal_level_bytes_written_blob" << pl_stats.bytes_written_blob; } @@ -1124,59 +1333,62 @@ void CompactionJob::NotifyOnSubcompactionCompleted( } } -void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { - assert(sub_compact); - assert(sub_compact->compaction); +bool CompactionJob::ShouldUseLocalCompaction(SubcompactionState* sub_compact) { if (db_options_.compaction_service) { CompactionServiceJobStatus comp_status = ProcessKeyValueCompactionWithCompactionService(sub_compact); - if (comp_status == CompactionServiceJobStatus::kSuccess || - comp_status == CompactionServiceJobStatus::kFailure) { - return; + if (comp_status != CompactionServiceJobStatus::kUseLocal) { + return false; } // fallback to local compaction assert(comp_status == CompactionServiceJobStatus::kUseLocal); sub_compact->compaction_job_stats.is_remote_compaction = false; } + return true; +} - uint64_t prev_cpu_micros = db_options_.clock->CPUMicros(); +CompactionJob::CompactionIOStatsSnapshot CompactionJob::InitializeIOStats() { + CompactionIOStatsSnapshot io_stats; - ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); + if (measure_io_stats_) { + io_stats.prev_perf_level = GetPerfLevel(); + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); + io_stats.prev_write_nanos = IOSTATS(write_nanos); + io_stats.prev_fsync_nanos = IOSTATS(fsync_nanos); + io_stats.prev_range_sync_nanos = IOSTATS(range_sync_nanos); + io_stats.prev_prepare_write_nanos = IOSTATS(prepare_write_nanos); + io_stats.prev_cpu_write_nanos = IOSTATS(cpu_write_nanos); + io_stats.prev_cpu_read_nanos = IOSTATS(cpu_read_nanos); + } + + return io_stats; +} + +Status CompactionJob::SetupAndValidateCompactionFilter( + SubcompactionState* sub_compact, + const CompactionFilter* configured_compaction_filter, + const CompactionFilter*& compaction_filter, + std::unique_ptr& compaction_filter_from_factory) { + compaction_filter = configured_compaction_filter; - // Create compaction filter and fail the compaction if - // IgnoreSnapshots() = false because it is not supported anymore - const CompactionFilter* compaction_filter = cfd->ioptions().compaction_filter; - std::unique_ptr compaction_filter_from_factory = nullptr; if (compaction_filter == nullptr) { compaction_filter_from_factory = sub_compact->compaction->CreateCompactionFilter(); compaction_filter = compaction_filter_from_factory.get(); } + if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) { - sub_compact->status = Status::NotSupported( + return Status::NotSupported( "CompactionFilter::IgnoreSnapshots() = false is not supported " "anymore."); - return; } - NotifyOnSubcompactionBegin(sub_compact); - - // This is assigned after creation of SubcompactionState to simplify that - // creation across both CompactionJob and CompactionServiceCompactionJob - sub_compact->AssignRangeDelAggregator( - std::make_unique( - &cfd->internal_comparator(), existing_snapshots_, - &full_history_ts_low_, &trim_ts_)); - - // TODO: since we already use C++17, should use - // std::optional instead. - const std::optional start = sub_compact->start; - const std::optional end = sub_compact->end; - - std::optional start_without_ts; - std::optional end_without_ts; + return Status::OK(); +} - ReadOptions read_options; +void CompactionJob::InitializeReadOptionsAndBoundaries( + const size_t ts_sz, ReadOptions& read_options, + SubcompactionKeyBoundaries& boundaries) { read_options.verify_checksums = true; read_options.fill_cache = false; read_options.rate_limiter_priority = GetRateLimiterPriority(); @@ -1187,242 +1399,245 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // (b) CompactionFilter::Decision::kRemoveAndSkipUntil. read_options.total_order_seek = true; - const WriteOptions write_options(Env::IOPriority::IO_LOW, - Env::IOActivity::kCompaction); - // Remove the timestamps from boundaries because boundaries created in // GenSubcompactionBoundaries doesn't strip away the timestamp. - size_t ts_sz = cfd->user_comparator()->timestamp_size(); - if (start.has_value()) { - read_options.iterate_lower_bound = &(*start); + if (boundaries.start.has_value()) { + read_options.iterate_lower_bound = &(*boundaries.start); if (ts_sz > 0) { - start_without_ts = StripTimestampFromUserKey(*start, ts_sz); - read_options.iterate_lower_bound = &(*start_without_ts); + boundaries.start_without_ts = + StripTimestampFromUserKey(*boundaries.start, ts_sz); + read_options.iterate_lower_bound = &(*boundaries.start_without_ts); } } - if (end.has_value()) { - read_options.iterate_upper_bound = &(*end); + if (boundaries.end.has_value()) { + read_options.iterate_upper_bound = &(*boundaries.end); if (ts_sz > 0) { - end_without_ts = StripTimestampFromUserKey(*end, ts_sz); - read_options.iterate_upper_bound = &(*end_without_ts); + boundaries.end_without_ts = + StripTimestampFromUserKey(*boundaries.end, ts_sz); + read_options.iterate_upper_bound = &(*boundaries.end_without_ts); } } - // Although the v2 aggregator is what the level iterator(s) know about, - // the AddTombstones calls will be propagated down to the v1 aggregator. - std::unique_ptr raw_input(versions_->MakeInputIterator( - read_options, sub_compact->compaction, sub_compact->RangeDelAgg(), - file_options_for_read_, start, end)); - InternalIterator* input = raw_input.get(); - - IterKey start_ikey; - IterKey end_ikey; - Slice start_slice; - Slice end_slice; - Slice start_user_key{}; - Slice end_user_key{}; - - static constexpr char kMaxTs[] = - "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"; - Slice ts_slice; - std::string max_ts; if (ts_sz > 0) { - if (ts_sz <= strlen(kMaxTs)) { - ts_slice = Slice(kMaxTs, ts_sz); + if (ts_sz <= strlen(boundaries.kMaxTs)) { + boundaries.ts_slice = Slice(boundaries.kMaxTs, ts_sz); } else { - max_ts = std::string(ts_sz, '\xff'); - ts_slice = Slice(max_ts); + boundaries.max_ts = std::string(ts_sz, '\xff'); + boundaries.ts_slice = Slice(boundaries.max_ts); } } - - if (start.has_value()) { - start_ikey.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek); + if (boundaries.start.has_value()) { + boundaries.start_ikey.SetInternalKey(*boundaries.start, kMaxSequenceNumber, + kValueTypeForSeek); if (ts_sz > 0) { - start_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek, - &ts_slice); + boundaries.start_ikey.UpdateInternalKey( + kMaxSequenceNumber, kValueTypeForSeek, &boundaries.ts_slice); } - start_slice = start_ikey.GetInternalKey(); - start_user_key = start_ikey.GetUserKey(); + boundaries.start_internal_key = boundaries.start_ikey.GetInternalKey(); + boundaries.start_user_key = boundaries.start_ikey.GetUserKey(); } - if (end.has_value()) { - end_ikey.SetInternalKey(*end, kMaxSequenceNumber, kValueTypeForSeek); + if (boundaries.end.has_value()) { + boundaries.end_ikey.SetInternalKey(*boundaries.end, kMaxSequenceNumber, + kValueTypeForSeek); if (ts_sz > 0) { - end_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek, - &ts_slice); + boundaries.end_ikey.UpdateInternalKey( + kMaxSequenceNumber, kValueTypeForSeek, &boundaries.ts_slice); } - end_slice = end_ikey.GetInternalKey(); - end_user_key = end_ikey.GetUserKey(); + boundaries.end_internal_key = boundaries.end_ikey.GetInternalKey(); + boundaries.end_user_key = boundaries.end_ikey.GetUserKey(); } +} - std::unique_ptr clip; - if (start.has_value() || end.has_value()) { - clip = std::make_unique( - raw_input.get(), start.has_value() ? &start_slice : nullptr, - end.has_value() ? &end_slice : nullptr, &cfd->internal_comparator()); - input = clip.get(); - } +InternalIterator* CompactionJob::CreateInputIterator( + SubcompactionState* sub_compact, ColumnFamilyData* cfd, + SubcompactionInternalIterators& iterators, + SubcompactionKeyBoundaries& boundaries, ReadOptions& read_options) { + const size_t ts_sz = cfd->user_comparator()->timestamp_size(); + InitializeReadOptionsAndBoundaries(ts_sz, read_options, boundaries); + + // This is assigned after creation of SubcompactionState to simplify that + // creation across both CompactionJob and CompactionServiceCompactionJob + sub_compact->AssignRangeDelAggregator( + std::make_unique( + &cfd->internal_comparator(), job_context_->snapshot_seqs, + &full_history_ts_low_, &trim_ts_)); + + // Although the v2 aggregator is what the level iterator(s) know about, + // the AddTombstones calls will be propagated down to the v1 aggregator. + iterators.raw_input = + std::unique_ptr(versions_->MakeInputIterator( + read_options, sub_compact->compaction, sub_compact->RangeDelAgg(), + file_options_for_read_, boundaries.start, boundaries.end)); + InternalIterator* input = iterators.raw_input.get(); - std::unique_ptr blob_counter; + if (boundaries.start.has_value() || boundaries.end.has_value()) { + iterators.clip = std::make_unique( + iterators.raw_input.get(), + boundaries.start.has_value() ? &boundaries.start_internal_key : nullptr, + boundaries.end.has_value() ? &boundaries.end_internal_key : nullptr, + &cfd->internal_comparator()); + input = iterators.clip.get(); + } if (sub_compact->compaction->DoesInputReferenceBlobFiles()) { BlobGarbageMeter* meter = sub_compact->Current().CreateBlobGarbageMeter(); - blob_counter = std::make_unique(input, meter); - input = blob_counter.get(); + iterators.blob_counter = + std::make_unique(input, meter); + input = iterators.blob_counter.get(); } - std::unique_ptr trim_history_iter; if (ts_sz > 0 && !trim_ts_.empty()) { - trim_history_iter = std::make_unique( + iterators.trim_history_iter = std::make_unique( input, cfd->user_comparator(), trim_ts_); - input = trim_history_iter.get(); + input = iterators.trim_history_iter.get(); } - input->SeekToFirst(); - - AutoThreadOperationStageUpdater stage_updater( - ThreadStatus::STAGE_COMPACTION_PROCESS_KV); - - // I/O measurement variables - PerfLevel prev_perf_level = PerfLevel::kEnableTime; - const uint64_t kRecordStatsEvery = 1000; - uint64_t prev_write_nanos = 0; - uint64_t prev_fsync_nanos = 0; - uint64_t prev_range_sync_nanos = 0; - uint64_t prev_prepare_write_nanos = 0; - uint64_t prev_cpu_write_nanos = 0; - uint64_t prev_cpu_read_nanos = 0; - if (measure_io_stats_) { - prev_perf_level = GetPerfLevel(); - SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); - prev_write_nanos = IOSTATS(write_nanos); - prev_fsync_nanos = IOSTATS(fsync_nanos); - prev_range_sync_nanos = IOSTATS(range_sync_nanos); - prev_prepare_write_nanos = IOSTATS(prepare_write_nanos); - prev_cpu_write_nanos = IOSTATS(cpu_write_nanos); - prev_cpu_read_nanos = IOSTATS(cpu_read_nanos); - } - - MergeHelper merge( - env_, cfd->user_comparator(), cfd->ioptions().merge_operator.get(), - compaction_filter, db_options_.info_log.get(), - false /* internal key corruption is expected */, - existing_snapshots_.empty() ? 0 : existing_snapshots_.back(), - snapshot_checker_, compact_->compaction->level(), db_options_.stats); + return input; +} +void CompactionJob::CreateBlobFileBuilder( + SubcompactionState* sub_compact, ColumnFamilyData* cfd, + std::unique_ptr& blob_file_builder, + const WriteOptions& write_options) { const auto& mutable_cf_options = sub_compact->compaction->mutable_cf_options(); - std::vector blob_file_paths; - - // TODO: BlobDB to support output_to_penultimate_level compaction, which needs + // TODO: BlobDB to support output_to_proximal_level compaction, which needs // 2 builders, so may need to move to `CompactionOutputs` - std::unique_ptr blob_file_builder( - (mutable_cf_options.enable_blob_files && - sub_compact->compaction->output_level() >= - mutable_cf_options.blob_file_starting_level) - ? new BlobFileBuilder( - versions_, fs_.get(), - &sub_compact->compaction->immutable_options(), - &mutable_cf_options, &file_options_, &write_options, db_id_, - db_session_id_, job_id_, cfd->GetID(), cfd->GetName(), - write_hint_, io_tracer_, blob_callback_, - BlobFileCreationReason::kCompaction, &blob_file_paths, - sub_compact->Current().GetBlobFileAdditionsPtr()) - : nullptr); + if (mutable_cf_options.enable_blob_files && + sub_compact->compaction->output_level() >= + mutable_cf_options.blob_file_starting_level) { + blob_file_builder = std::make_unique( + versions_, fs_.get(), &sub_compact->compaction->immutable_options(), + &mutable_cf_options, &file_options_, &write_options, db_id_, + db_session_id_, job_id_, cfd->GetID(), cfd->GetName(), write_hint_, + io_tracer_, blob_callback_, BlobFileCreationReason::kCompaction, + sub_compact->Current().GetOutputFilePathsPtr(), + sub_compact->Current().GetBlobFileAdditionsPtr()); + } else { + blob_file_builder = nullptr; + } +} - TEST_SYNC_POINT("CompactionJob::Run():Inprogress"); - TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():PausingManualCompaction:1", - static_cast(const_cast*>( - &manual_compaction_canceled_))); +std::unique_ptr CompactionJob::CreateCompactionIterator( + SubcompactionState* sub_compact, ColumnFamilyData* cfd, + InternalIterator* input, const CompactionFilter* compaction_filter, + MergeHelper& merge, std::unique_ptr& blob_file_builder, + const WriteOptions& write_options) { + CreateBlobFileBuilder(sub_compact, cfd, blob_file_builder, write_options); const std::string* const full_history_ts_low = full_history_ts_low_.empty() ? nullptr : &full_history_ts_low_; - const SequenceNumber job_snapshot_seq = - job_context_ ? job_context_->GetJobSnapshotSequence() - : kMaxSequenceNumber; + assert(job_context_); - auto c_iter = std::make_unique( + return std::make_unique( input, cfd->user_comparator(), &merge, versions_->LastSequence(), - &existing_snapshots_, earliest_snapshot_, - earliest_write_conflict_snapshot_, job_snapshot_seq, snapshot_checker_, - env_, ShouldReportDetailedTime(env_, stats_), - /*expect_valid_internal_key=*/true, sub_compact->RangeDelAgg(), + &(job_context_->snapshot_seqs), earliest_snapshot_, + job_context_->earliest_write_conflict_snapshot, + job_context_->GetJobSnapshotSequence(), job_context_->snapshot_checker, + env_, ShouldReportDetailedTime(env_, stats_), sub_compact->RangeDelAgg(), blob_file_builder.get(), db_options_.allow_data_in_errors, db_options_.enforce_single_del_contracts, manual_compaction_canceled_, sub_compact->compaction ->DoesInputReferenceBlobFiles() /* must_count_input_entries */, sub_compact->compaction, compaction_filter, shutting_down_, db_options_.info_log, full_history_ts_low, preserve_seqno_after_); - c_iter->SeekToFirst(); - - const auto& c_iter_stats = c_iter->iter_stats(); +} - // define the open and close functions for the compaction files, which will be - // used open/close output files when needed. +std::pair +CompactionJob::CreateFileHandlers(SubcompactionState* sub_compact, + SubcompactionKeyBoundaries& boundaries) { const CompactionFileOpenFunc open_file_func = [this, sub_compact](CompactionOutputs& outputs) { return this->OpenCompactionOutputFile(sub_compact, outputs); }; + const Slice* start_user_key = + sub_compact->start.has_value() ? &boundaries.start_user_key : nullptr; + const Slice* end_user_key = + sub_compact->end.has_value() ? &boundaries.end_user_key : nullptr; + const CompactionFileCloseFunc close_file_func = [this, sub_compact, start_user_key, end_user_key]( - CompactionOutputs& outputs, const Status& status, - const Slice& next_table_min_key) { + const Status& status, + const ParsedInternalKey& prev_iter_output_internal_key, + const Slice& next_table_min_key, const CompactionIterator* c_iter, + CompactionOutputs& outputs) { return this->FinishCompactionOutputFile( - status, sub_compact, outputs, next_table_min_key, - sub_compact->start.has_value() ? &start_user_key : nullptr, - sub_compact->end.has_value() ? &end_user_key : nullptr); + status, prev_iter_output_internal_key, next_table_min_key, + start_user_key, end_user_key, c_iter, sub_compact, outputs); }; + return {open_file_func, close_file_func}; +} + +Status CompactionJob::ProcessKeyValue( + SubcompactionState* sub_compact, ColumnFamilyData* cfd, + CompactionIterator* c_iter, const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func, uint64_t& prev_cpu_micros) { + // Cron interval for periodic operations: stats update, abort check, + // and sync points. Uses 1024 (power of 2) for efficient bitwise check. + const uint64_t kCronEveryMask = (1 << 10) - 1; + [[maybe_unused]] const std::optional end = sub_compact->end; + + // Check for abort signal before starting key processing + if (compaction_aborted_.load(std::memory_order_acquire) > 0) { + return Status::Incomplete(Status::SubCode::kCompactionAborted); + } + Status status; + IterKey prev_iter_output_key; + ParsedInternalKey prev_iter_output_internal_key; + TEST_SYNC_POINT_CALLBACK( "CompactionJob::ProcessKeyValueCompaction()::Processing", static_cast(const_cast(sub_compact->compaction))); - uint64_t last_cpu_micros = prev_cpu_micros; - while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) { - // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid() - // returns true. + + while (status.ok() && !cfd->IsDropped() && c_iter->Valid() && + c_iter->status().ok()) { assert(!end.has_value() || cfd->user_comparator()->Compare(c_iter->user_key(), *end) < 0); - if (c_iter_stats.num_input_records % kRecordStatsEvery == - kRecordStatsEvery - 1) { - RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); - c_iter->ResetRecordCounts(); - RecordCompactionIOStats(); - - uint64_t cur_cpu_micros = db_options_.clock->CPUMicros(); - assert(cur_cpu_micros >= last_cpu_micros); - RecordTick(stats_, COMPACTION_CPU_TOTAL_TIME, - cur_cpu_micros - last_cpu_micros); - last_cpu_micros = cur_cpu_micros; + const uint64_t num_records = c_iter->iter_stats().num_input_records; + + // Periodic cron operations: stats update, abort check. + if ((num_records & kCronEveryMask) == kCronEveryMask) { + // Check for abort signal periodically + if (compaction_aborted_.load(std::memory_order_acquire) > 0) { + status = Status::Incomplete(Status::SubCode::kCompactionAborted); + break; + } + + UpdateSubcompactionJobStatsIncrementally( + c_iter, &sub_compact->compaction_job_stats, + db_options_.clock->CPUMicros(), prev_cpu_micros); } const auto& ikey = c_iter->ikey(); - bool use_penultimate_output = ikey.sequence > penultimate_after_seqno_; + bool use_proximal_output = ikey.sequence > proximal_after_seqno_; + #ifndef NDEBUG if (sub_compact->compaction->SupportsPerKeyPlacement()) { - // Could be overridden by unittest PerKeyPlacementContext context(sub_compact->compaction->output_level(), ikey.user_key, c_iter->value(), - ikey.sequence, use_penultimate_output); + ikey.sequence, use_proximal_output); TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context", &context); - if (use_penultimate_output) { - // Verify that entries sent to the penultimate level are within the + if (use_proximal_output) { + // Verify that entries sent to the proximal level are within the // allowed range (because the input key range of the last level could - // be larger than the allowed output key range of the penultimate + // be larger than the allowed output key range of the proximal // level). This check uses user keys (ignores sequence numbers) because // compaction boundaries are a "clean cut" between user keys (see // CompactionPicker::ExpandInputsToCleanCut()), which is especially // important when preferred sequence numbers has been swapped in for // kTypeValuePreferredSeqno / TimedPut. - sub_compact->compaction->TEST_AssertWithinPenultimateLevelOutputRange( + sub_compact->compaction->TEST_AssertWithinProximalLevelOutputRange( c_iter->user_key()); } } else { - assert(penultimate_after_seqno_ == kMaxSequenceNumber); - assert(!use_penultimate_output); + assert(proximal_after_seqno_ == kMaxSequenceNumber); + assert(!use_proximal_output); } #endif // NDEBUG @@ -1431,8 +1646,9 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // and `close_file_func`. // TODO: it would be better to have the compaction file open/close moved // into `CompactionOutputs` which has the output file information. - status = sub_compact->AddToOutput(*c_iter, use_penultimate_output, - open_file_func, close_file_func); + status = sub_compact->AddToOutput(*c_iter, use_proximal_output, + open_file_func, close_file_func, + prev_iter_output_internal_key); if (!status.ok()) { break; } @@ -1440,10 +1656,12 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():PausingManualCompaction:2", static_cast(const_cast*>( &manual_compaction_canceled_))); + + prev_iter_output_key.SetInternalKey(c_iter->key(), + &prev_iter_output_internal_key); + prev_iter_output_internal_key.sequence = ikey.sequence; + prev_iter_output_internal_key.type = ikey.type; c_iter->Next(); - if (c_iter->status().IsManualCompactionPaused()) { - break; - } #ifndef NDEBUG bool stop = false; @@ -1455,13 +1673,33 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { #endif // NDEBUG } - // This number may not be accurate when CompactionIterator was created - // with `must_count_input_entries=false`. + return status; +} + +void CompactionJob::UpdateSubcompactionJobStatsIncrementally( + CompactionIterator* c_iter, CompactionJobStats* compaction_job_stats, + uint64_t cur_cpu_micros, uint64_t& prev_cpu_micros) { + RecordDroppedKeys(c_iter->iter_stats(), compaction_job_stats); + c_iter->ResetRecordCounts(); + RecordCompactionIOStats(); + + assert(cur_cpu_micros >= prev_cpu_micros); + RecordTick(stats_, COMPACTION_CPU_TOTAL_TIME, + cur_cpu_micros - prev_cpu_micros); + prev_cpu_micros = cur_cpu_micros; +} + +void CompactionJob::FinalizeSubcompactionJobStats( + SubcompactionState* sub_compact, CompactionIterator* c_iter, + uint64_t start_cpu_micros, uint64_t prev_cpu_micros, + const CompactionIOStatsSnapshot& io_stats) { + const CompactionIterationStats& c_iter_stats = c_iter->iter_stats(); + assert(!sub_compact->compaction->DoesInputReferenceBlobFiles() || c_iter->HasNumInputEntryScanned()); - sub_compact->compaction_job_stats.has_num_input_records = + sub_compact->compaction_job_stats.has_accurate_num_input_records &= c_iter->HasNumInputEntryScanned(); - sub_compact->compaction_job_stats.num_input_records = + sub_compact->compaction_job_stats.num_input_records += c_iter->NumInputEntryScanned(); sub_compact->compaction_job_stats.num_blobs_read = c_iter_stats.num_blobs_read; @@ -1492,84 +1730,198 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { c_iter_stats.total_blob_bytes_relocated); } - RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); - RecordCompactionIOStats(); + uint64_t cur_cpu_micros = db_options_.clock->CPUMicros(); + + // Record final compaction statistics including dropped keys, I/O stats, + // and CPU time delta from the last periodic measurement + UpdateSubcompactionJobStatsIncrementally(c_iter, + &sub_compact->compaction_job_stats, + cur_cpu_micros, prev_cpu_micros); + + // Finalize timing and I/O statistics + sub_compact->compaction_job_stats.cpu_micros = + cur_cpu_micros - start_cpu_micros + sub_compact->GetWorkerCPUMicros(); + + if (measure_io_stats_) { + sub_compact->compaction_job_stats.file_write_nanos += + IOSTATS(write_nanos) - io_stats.prev_write_nanos; + sub_compact->compaction_job_stats.file_fsync_nanos += + IOSTATS(fsync_nanos) - io_stats.prev_fsync_nanos; + sub_compact->compaction_job_stats.file_range_sync_nanos += + IOSTATS(range_sync_nanos) - io_stats.prev_range_sync_nanos; + sub_compact->compaction_job_stats.file_prepare_write_nanos += + IOSTATS(prepare_write_nanos) - io_stats.prev_prepare_write_nanos; + sub_compact->compaction_job_stats.cpu_micros -= + (IOSTATS(cpu_write_nanos) - io_stats.prev_cpu_write_nanos + + IOSTATS(cpu_read_nanos) - io_stats.prev_cpu_read_nanos) / + 1000; + if (io_stats.prev_perf_level != + PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) { + SetPerfLevel(io_stats.prev_perf_level); + } + } +} +Status CompactionJob::FinalizeProcessKeyValueStatus( + ColumnFamilyData* cfd, InternalIterator* input_iter, + CompactionIterator* c_iter, Status status) { if (status.ok() && cfd->IsDropped()) { status = Status::ColumnFamilyDropped("Column family dropped during compaction"); } - if ((status.ok() || status.IsColumnFamilyDropped()) && - shutting_down_->load(std::memory_order_relaxed)) { + if (status.ok() && shutting_down_->load(std::memory_order_relaxed)) { status = Status::ShutdownInProgress("Database shutdown"); } - if ((status.ok() || status.IsColumnFamilyDropped()) && + if (status.ok() && (manual_compaction_canceled_.load(std::memory_order_relaxed))) { status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); } if (status.ok()) { - status = input->status(); + status = input_iter->status(); } if (status.ok()) { status = c_iter->status(); } + return status; +} + +Status CompactionJob::CleanupCompactionFiles( + SubcompactionState* sub_compact, Status status, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func) { // Call FinishCompactionOutputFile() even if status is not ok: it needs to // close the output files. Open file function is also passed, in case there's // only range-dels, no file was opened, to save the range-dels, it need to // create a new output file. - status = sub_compact->CloseCompactionFiles(status, open_file_func, - close_file_func); + return sub_compact->CloseCompactionFiles(status, open_file_func, + close_file_func); +} +Status CompactionJob::FinalizeBlobFiles(SubcompactionState* sub_compact, + BlobFileBuilder* blob_file_builder, + Status status) { if (blob_file_builder) { if (status.ok()) { status = blob_file_builder->Finish(); } else { blob_file_builder->Abandon(status); } - blob_file_builder.reset(); sub_compact->Current().UpdateBlobStats(); } - uint64_t cur_cpu_micros = db_options_.clock->CPUMicros(); - sub_compact->compaction_job_stats.cpu_micros = - cur_cpu_micros - prev_cpu_micros; - RecordTick(stats_, COMPACTION_CPU_TOTAL_TIME, - cur_cpu_micros - last_cpu_micros); + return status; +} - if (measure_io_stats_) { - sub_compact->compaction_job_stats.file_write_nanos += - IOSTATS(write_nanos) - prev_write_nanos; - sub_compact->compaction_job_stats.file_fsync_nanos += - IOSTATS(fsync_nanos) - prev_fsync_nanos; - sub_compact->compaction_job_stats.file_range_sync_nanos += - IOSTATS(range_sync_nanos) - prev_range_sync_nanos; - sub_compact->compaction_job_stats.file_prepare_write_nanos += - IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos; - sub_compact->compaction_job_stats.cpu_micros -= - (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos + - IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) / - 1000; - if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) { - SetPerfLevel(prev_perf_level); - } +void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { + TEST_SYNC_POINT("CompactionJob::ProcessKeyValueCompaction:Start"); + assert(sub_compact); + assert(sub_compact->compaction); + + if (!ShouldUseLocalCompaction(sub_compact)) { + return; } + + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_PROCESS_KV); + + const uint64_t start_cpu_micros = db_options_.clock->CPUMicros(); + uint64_t prev_cpu_micros = start_cpu_micros; + const CompactionIOStatsSnapshot io_stats = InitializeIOStats(); + ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); + const CompactionFilter* compaction_filter; + std::unique_ptr compaction_filter_from_factory = nullptr; + Status filter_status = SetupAndValidateCompactionFilter( + sub_compact, cfd->ioptions().compaction_filter, compaction_filter, + compaction_filter_from_factory); + if (!filter_status.ok()) { + sub_compact->status = filter_status; + return; + } + + NotifyOnSubcompactionBegin(sub_compact); + + SubcompactionKeyBoundaries boundaries(sub_compact->start, sub_compact->end); + SubcompactionInternalIterators iterators; + ReadOptions read_options; + const WriteOptions write_options(Env::IOPriority::IO_LOW, + Env::IOActivity::kCompaction); + + InternalIterator* input_iter = CreateInputIterator( + sub_compact, cfd, iterators, boundaries, read_options); + + assert(input_iter); + + Status status = + MaybeResumeSubcompactionProgressOnInputIterator(sub_compact, input_iter); + + if (status.IsNotFound()) { + input_iter->SeekToFirst(); + } else if (!status.ok()) { + sub_compact->status = status; + return; + } + + MergeHelper merge( + env_, cfd->user_comparator(), cfd->ioptions().merge_operator.get(), + compaction_filter, db_options_.info_log.get(), + false /* internal key corruption is expected */, + job_context_->GetLatestSnapshotSequence(), job_context_->snapshot_checker, + compact_->compaction->level(), db_options_.stats); + std::unique_ptr blob_file_builder; + + auto c_iter = + CreateCompactionIterator(sub_compact, cfd, input_iter, compaction_filter, + merge, blob_file_builder, write_options); + assert(c_iter); + c_iter->SeekToFirst(); + + TEST_SYNC_POINT("CompactionJob::Run():Inprogress"); + TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():PausingManualCompaction:1", + static_cast(const_cast*>( + &manual_compaction_canceled_))); + + auto [open_file_func, close_file_func] = + CreateFileHandlers(sub_compact, boundaries); + + status = ProcessKeyValue(sub_compact, cfd, c_iter.get(), open_file_func, + close_file_func, prev_cpu_micros); + + status = FinalizeProcessKeyValueStatus(cfd, input_iter, c_iter.get(), status); + + FinalizeSubcompaction(sub_compact, status, open_file_func, close_file_func, + blob_file_builder.get(), c_iter.get(), input_iter, + start_cpu_micros, prev_cpu_micros, io_stats); + + NotifyOnSubcompactionCompleted(sub_compact); +} + +void CompactionJob::FinalizeSubcompaction( + SubcompactionState* sub_compact, Status status, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func, + BlobFileBuilder* blob_file_builder, CompactionIterator* c_iter, + [[maybe_unused]] InternalIterator* input_iter, uint64_t start_cpu_micros, + uint64_t prev_cpu_micros, const CompactionIOStatsSnapshot& io_stats) { + status = CleanupCompactionFiles(sub_compact, status, open_file_func, + close_file_func); + status = FinalizeBlobFiles(sub_compact, blob_file_builder, status); + + FinalizeSubcompactionJobStats(sub_compact, c_iter, start_cpu_micros, + prev_cpu_micros, io_stats); + #ifdef ROCKSDB_ASSERT_STATUS_CHECKED if (!status.ok()) { if (c_iter) { c_iter->status().PermitUncheckedError(); } - if (input) { - input->status().PermitUncheckedError(); + if (input_iter) { + input_iter->status().PermitUncheckedError(); } } #endif // ROCKSDB_ASSERT_STATUS_CHECKED - blob_counter.reset(); - clip.reset(); - raw_input.reset(); sub_compact->status = status; - NotifyOnSubcompactionCompleted(sub_compact); } uint64_t CompactionJob::GetCompactionId(SubcompactionState* sub_compact) const { @@ -1614,9 +1966,11 @@ void CompactionJob::RecordDroppedKeys( } Status CompactionJob::FinishCompactionOutputFile( - const Status& input_status, SubcompactionState* sub_compact, - CompactionOutputs& outputs, const Slice& next_table_min_key, - const Slice* comp_start_user_key, const Slice* comp_end_user_key) { + const Status& input_status, + const ParsedInternalKey& prev_iter_output_internal_key, + const Slice& next_table_min_key, const Slice* comp_start_user_key, + const Slice* comp_end_user_key, const CompactionIterator* c_iter, + SubcompactionState* sub_compact, CompactionOutputs& outputs) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_SYNC_FILE); assert(sub_compact != nullptr); @@ -1634,24 +1988,20 @@ Status CompactionJob::FinishCompactionOutputFile( Status s = input_status; // Add range tombstones - auto earliest_snapshot = kMaxSequenceNumber; - if (existing_snapshots_.size() > 0) { - earliest_snapshot = existing_snapshots_[0]; - } if (s.ok()) { // Inclusive lower bound, exclusive upper bound std::pair keep_seqno_range{ 0, kMaxSequenceNumber}; if (sub_compact->compaction->SupportsPerKeyPlacement()) { - if (outputs.IsPenultimateLevel()) { - keep_seqno_range.first = penultimate_after_seqno_; + if (outputs.IsProximalLevel()) { + keep_seqno_range.first = proximal_after_seqno_; } else { - keep_seqno_range.second = penultimate_after_seqno_; + keep_seqno_range.second = proximal_after_seqno_; } } CompactionIterationStats range_del_out_stats; // NOTE1: Use `bottommost_level_ = true` for both bottommost and - // output_to_penultimate_level compaction here, as it's only used to decide + // output_to_proximal_level compaction here, as it's only used to decide // if range dels could be dropped. (Logically, we are taking a single sorted // run returned from CompactionIterator and physically splitting it between // two output levels.) @@ -1663,7 +2013,7 @@ Status CompactionJob::FinishCompactionOutputFile( s = outputs.AddRangeDels(*sub_compact->RangeDelAgg(), comp_start_user_key, comp_end_user_key, range_del_out_stats, bottommost_level_, cfd->internal_comparator(), - earliest_snapshot, keep_seqno_range, + earliest_snapshot_, keep_seqno_range, next_table_min_key, full_history_ts_low_); } RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats); @@ -1720,14 +2070,11 @@ Status CompactionJob::FinishCompactionOutputFile( if (s.ok()) { tp = outputs.GetTableProperties(); } - if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) { // If there is nothing to output, no necessary to generate a sst file. // This happens when the output level is bottom level, at the same time // the sub_compact output nothing. - std::string fname = - TableFileName(sub_compact->compaction->immutable_options().cf_paths, - meta->fd.GetNumber(), meta->fd.GetPathId()); + std::string fname = GetTableFileName(meta->fd.GetNumber()); // TODO(AR) it is not clear if there are any larger implications if // DeleteFile fails here @@ -1797,10 +2144,99 @@ Status CompactionJob::FinishCompactionOutputFile( } } + if (s.ok() && ShouldUpdateSubcompactionProgress(sub_compact, c_iter, + prev_iter_output_internal_key, + next_table_min_key, meta)) { + UpdateSubcompactionProgress(c_iter, next_table_min_key, sub_compact); + s = PersistSubcompactionProgress(sub_compact); + } outputs.ResetBuilder(); return s; } +bool CompactionJob::ShouldUpdateSubcompactionProgress( + const SubcompactionState* sub_compact, const CompactionIterator* c_iter, + const ParsedInternalKey& prev_iter_output_internal_key, + const Slice& next_table_min_internal_key, const FileMetaData* meta) const { + const auto* cfd = sub_compact->compaction->column_family_data(); + // No need to update when the progress will not get persisted + if (compaction_progress_writer_ == nullptr) { + return false; + } + + // No need to update for a new empty output + if (meta == nullptr) { + return false; + } + + // TODO(hx235): save progress even on the last output file + if (next_table_min_internal_key.empty()) { + return false; + } + + // LIMITATION: Persisting compaction progress with timestamp + // is not supported since the feature of persisting timestamp of the key in + // SST files itself is still experimental + size_t ts_sz = cfd->user_comparator()->timestamp_size(); + if (ts_sz > 0) { + return false; + } + + // LIMITATION: Compaction progress persistence disabled for file boundaries + // containing range deletions. Range deletions can span file boundaries, + // making it difficult to ensure adjacent output tables have different user + // keys. See the last check for why different users keys of adjacent output + // tables are needed + const ValueType next_table_min_internal_key_type = + ExtractValueType(next_table_min_internal_key); + const ValueType prev_iter_output_internal_key_type = + prev_iter_output_internal_key.user_key.empty() + ? ValueType::kTypeValue + : prev_iter_output_internal_key.type; + + // Range deletes truncated to align with file boundaries may be output by the + // compaction iterator with `ValueType::kTypeMaxValid` instead of the original + // type. + if ((next_table_min_internal_key_type == ValueType::kTypeRangeDeletion || + next_table_min_internal_key_type == ValueType::kTypeMaxValid) || + (prev_iter_output_internal_key_type == ValueType::kTypeRangeDeletion || + prev_iter_output_internal_key_type == ValueType::kTypeMaxValid)) { + return false; + } + + // LIMITATION: Compaction progress persistence disabled when adjacent output + // tables share the same user key at boundaries. This ensures a simple Seek() + // of the next key when resuming can process all versions of a user key + const Slice next_table_min_user_key = + ExtractUserKey(next_table_min_internal_key); + const Slice prev_table_last_user_key = + prev_iter_output_internal_key.user_key.empty() + ? Slice() + : prev_iter_output_internal_key.user_key; + + if (cfd->user_comparator()->EqualWithoutTimestamp(next_table_min_user_key, + prev_table_last_user_key)) { + return false; + } + + // LIMITATION: Don't save progress if the current key has already been scanned + // (looked ahead) in the input but not yet output. This can happen with merge + // operations, single deletes, and deletes at the bottommost level where + // CompactionIterator needs to look ahead to process multiple entries for the + // same user key before outputting a result. If we saved progress and resumed + // at this boundary, the resumed session would see and process the same input + // key again through Seek(), leading to incorrect double-counting in + // number of processed input entries and input count verification failure + // + // TODO(hx235): Offset num_processed_input_records to avoid double counting + // instead of disabling progress persistence. + if (c_iter->IsCurrentKeyAlreadyScanned()) { + return false; + } + + return true; +} + Status CompactionJob::InstallCompactionResults(bool* compaction_released) { assert(compact_); @@ -1814,22 +2250,22 @@ Status CompactionJob::InstallCompactionResults(bool* compaction_released) { { Compaction::InputLevelSummaryBuffer inputs_summary; - if (compaction_stats_.has_penultimate_level_output) { + if (internal_stats_.has_proximal_level_output) { ROCKS_LOG_BUFFER( log_buffer_, - "[%s] [JOB %d] Compacted %s => output_to_penultimate_level: %" PRIu64 + "[%s] [JOB %d] Compacted %s => output_to_proximal_level: %" PRIu64 " bytes + last: %" PRIu64 " bytes. Total: %" PRIu64 " bytes", compaction->column_family_data()->GetName().c_str(), job_id_, compaction->InputLevelSummary(&inputs_summary), - compaction_stats_.penultimate_level_stats.bytes_written, - compaction_stats_.stats.bytes_written, - compaction_stats_.TotalBytesWritten()); + internal_stats_.proximal_level_stats.bytes_written, + internal_stats_.output_level_stats.bytes_written, + internal_stats_.TotalBytesWritten()); } else { ROCKS_LOG_BUFFER(log_buffer_, "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes", compaction->column_family_data()->GetName().c_str(), job_id_, compaction->InputLevelSummary(&inputs_summary), - compaction_stats_.TotalBytesWritten()); + internal_stats_.TotalBytesWritten()); } } @@ -1926,6 +2362,10 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, // no need to lock because VersionSet::next_file_number_ is atomic uint64_t file_number = versions_->NewFileNumber(); +#ifndef NDEBUG + TEST_SYNC_POINT_CALLBACK( + "CompactionJob::OpenCompactionOutputFile::NewFileNumber", &file_number); +#endif std::string fname = GetTableFileName(file_number); // Fire events. ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); @@ -1942,21 +2382,18 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, // Pass temperature of the last level files to FileSystem. FileOptions fo_copy = file_options_; - Temperature temperature = sub_compact->compaction->output_temperature(); - Temperature last_level_temp = - sub_compact->compaction->mutable_cf_options().last_level_temperature; - // Here last_level_temperature supersedes default_write_temperature, when - // enabled and applicable - if (last_level_temp != Temperature::kUnknown && - sub_compact->compaction->is_last_level() && - !outputs.IsPenultimateLevel()) { - temperature = last_level_temp; - } + auto temperature = + sub_compact->compaction->GetOutputTemperature(outputs.IsProximalLevel()); fo_copy.temperature = temperature; + fo_copy.write_hint = write_hint_; Status s; IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy); s = io_s; + if (io_s.ok()) { + // Track the SST file path for cleanup on abort. + outputs.AddOutputFilePath(fname); + } if (sub_compact->io_status.ok()) { sub_compact->io_status = io_s; // Since this error is really a copy of the io_s that is checked below as s, @@ -2038,7 +2475,9 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, } writable_file->SetIOPriority(GetRateLimiterPriority()); - writable_file->SetWriteLifeTimeHint(write_hint_); + // Subsequent attempts to override the hint via SetWriteLifeTimeHint + // with the very same value will be ignored by the fs. + writable_file->SetWriteLifeTimeHint(fo_copy.write_hint); FileTypeSet tmp_set = db_options_.checksum_handoff_file_types; writable_file->SetPreallocationBlockSize(static_cast( sub_compact->compaction->OutputFilePreallocationSize())); @@ -2063,7 +2502,7 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, bottommost_level_, TableFileCreationReason::kCompaction, 0 /* oldest_key_time */, current_time, db_id_, db_session_id_, sub_compact->compaction->max_output_file_size(), file_number, - penultimate_after_seqno_ /*last_level_inclusive_max_seqno_threshold*/); + proximal_after_seqno_ /*last_level_inclusive_max_seqno_threshold*/); outputs.NewBuilder(tboptions); @@ -2087,16 +2526,43 @@ void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) { } } // namespace -bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) { +bool CompactionJob::UpdateInternalStatsFromInputFiles( + uint64_t* num_input_range_del) { assert(compact_); Compaction* compaction = compact_->compaction; - compaction_stats_.stats.num_input_files_in_non_output_levels = 0; - compaction_stats_.stats.num_input_files_in_output_level = 0; + internal_stats_.output_level_stats.num_input_files_in_non_output_levels = 0; + internal_stats_.output_level_stats.num_input_files_in_output_level = 0; bool has_error = false; const ReadOptions read_options(Env::IOActivity::kCompaction); const auto& input_table_properties = compaction->GetInputTableProperties(); + + // Check all input files for old block-based SST format_version. Why? Old + // block-based SST files from roughly version 5.0 to 5.18 could produce + // inaccurate num_entries counts due to the evolution of its handling along + // with num_range_deletions. We have to disable some paranoid checks when + // compacting files from such an old release. However, we don't have great + // information to identify those files, so we heuristically over-approximate + // that set of files using + // (a) format_version < 5, which will be true for any files from RocksDB < + // 6.6.0 and should not be true for any recent production files + // (b) to avoid including non-block-based SST files (which still use older + // format_version markers, and do not support DeleteRange), we also require + // the presence of the user property "rocksdb.block.based.table.index.type", + // which was added in RocksDB 2.8 and is always present in block-based tables. + for (const auto& tp_pair : input_table_properties) { + if (tp_pair.second && tp_pair.second->format_version < 5) { + // Check for block-based table by looking for its index type property + const auto& user_props = tp_pair.second->user_collected_properties; + if (user_props.find(BlockBasedTablePropertyNames::kIndexType) != + user_props.end()) { + job_stats_->has_accurate_num_input_records = false; + break; + } + } + } + for (int input_level = 0; input_level < static_cast(compaction->num_input_levels()); ++input_level) { @@ -2104,13 +2570,14 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) { size_t num_input_files = flevel->num_files; uint64_t* bytes_read; if (compaction->level(input_level) != compaction->output_level()) { - compaction_stats_.stats.num_input_files_in_non_output_levels += + internal_stats_.output_level_stats.num_input_files_in_non_output_levels += static_cast(num_input_files); - bytes_read = &compaction_stats_.stats.bytes_read_non_output_levels; + bytes_read = + &internal_stats_.output_level_stats.bytes_read_non_output_levels; } else { - compaction_stats_.stats.num_input_files_in_output_level += + internal_stats_.output_level_stats.num_input_files_in_output_level += static_cast(num_input_files); - bytes_read = &compaction_stats_.stats.bytes_read_output_level; + bytes_read = &internal_stats_.output_level_stats.bytes_read_output_level; } for (size_t i = 0; i < num_input_files; ++i) { const FileMetaData* file_meta = flevel->files[i].file_metadata; @@ -2130,7 +2597,8 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) { has_error = true; } } - compaction_stats_.stats.num_input_records += file_input_entries; + internal_stats_.output_level_stats.num_input_records += + file_input_entries; if (num_input_range_del) { *num_input_range_del += file_num_range_del; } @@ -2141,62 +2609,123 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) { size_t num_filtered_input_files = filtered_flevel.size(); uint64_t* bytes_skipped; if (compaction->level(input_level) != compaction->output_level()) { - compaction_stats_.stats.num_filtered_input_files_in_non_output_levels += + internal_stats_.output_level_stats + .num_filtered_input_files_in_non_output_levels += static_cast(num_filtered_input_files); - bytes_skipped = &compaction_stats_.stats.bytes_skipped_non_output_levels; + bytes_skipped = + &internal_stats_.output_level_stats.bytes_skipped_non_output_levels; } else { - compaction_stats_.stats.num_filtered_input_files_in_output_level += + internal_stats_.output_level_stats + .num_filtered_input_files_in_output_level += static_cast(num_filtered_input_files); - bytes_skipped = &compaction_stats_.stats.bytes_skipped_output_level; + bytes_skipped = + &internal_stats_.output_level_stats.bytes_skipped_output_level; } for (const FileMetaData* filtered_file_meta : filtered_flevel) { *bytes_skipped += filtered_file_meta->fd.GetFileSize(); } } - assert(compaction_job_stats_); - compaction_stats_.stats.bytes_read_blob = - compaction_job_stats_->total_blob_bytes_read; - - compaction_stats_.stats.num_dropped_records = - compaction_stats_.DroppedRecords(); + // TODO - find a better place to set these two + assert(job_stats_); + internal_stats_.output_level_stats.bytes_read_blob = + job_stats_->total_blob_bytes_read; + internal_stats_.output_level_stats.num_dropped_records = + internal_stats_.DroppedRecords(); return !has_error; } -void CompactionJob::UpdateCompactionJobStats( - const InternalStats::CompactionStats& stats) const { - compaction_job_stats_->elapsed_micros = stats.micros; - +void CompactionJob::UpdateCompactionJobInputStatsFromInternalStats( + const InternalStats::CompactionStatsFull& internal_stats, + uint64_t num_input_range_del) const { + assert(job_stats_); // input information - compaction_job_stats_->total_input_bytes = - stats.bytes_read_non_output_levels + stats.bytes_read_output_level; - compaction_job_stats_->num_input_records = stats.num_input_records; - compaction_job_stats_->num_input_files = - stats.num_input_files_in_non_output_levels + - stats.num_input_files_in_output_level; - compaction_job_stats_->num_input_files_at_output_level = - stats.num_input_files_in_output_level; - compaction_job_stats_->num_filtered_input_files = - stats.num_filtered_input_files_in_non_output_levels + - stats.num_filtered_input_files_in_output_level; - compaction_job_stats_->num_filtered_input_files_at_output_level = - stats.num_filtered_input_files_in_output_level; - compaction_job_stats_->total_skipped_input_bytes = - stats.bytes_skipped_non_output_levels + stats.bytes_skipped_output_level; + job_stats_->total_input_bytes = + internal_stats.output_level_stats.bytes_read_non_output_levels + + internal_stats.output_level_stats.bytes_read_output_level; + job_stats_->num_input_records = + internal_stats.output_level_stats.num_input_records - num_input_range_del; + job_stats_->num_input_files = + internal_stats.output_level_stats.num_input_files_in_non_output_levels + + internal_stats.output_level_stats.num_input_files_in_output_level; + job_stats_->num_input_files_at_output_level = + internal_stats.output_level_stats.num_input_files_in_output_level; + job_stats_->num_filtered_input_files = + internal_stats.output_level_stats + .num_filtered_input_files_in_non_output_levels + + internal_stats.output_level_stats + .num_filtered_input_files_in_output_level; + job_stats_->num_filtered_input_files_at_output_level = + internal_stats.output_level_stats + .num_filtered_input_files_in_output_level; + job_stats_->total_skipped_input_bytes = + internal_stats.output_level_stats.bytes_skipped_non_output_levels + + internal_stats.output_level_stats.bytes_skipped_output_level; + + if (internal_stats.has_proximal_level_output) { + job_stats_->total_input_bytes += + internal_stats.proximal_level_stats.bytes_read_non_output_levels + + internal_stats.proximal_level_stats.bytes_read_output_level; + job_stats_->num_input_records += + internal_stats.proximal_level_stats.num_input_records; + job_stats_->num_input_files += + internal_stats.proximal_level_stats + .num_input_files_in_non_output_levels + + internal_stats.proximal_level_stats.num_input_files_in_output_level; + job_stats_->num_input_files_at_output_level += + internal_stats.proximal_level_stats.num_input_files_in_output_level; + job_stats_->num_filtered_input_files += + internal_stats.proximal_level_stats + .num_filtered_input_files_in_non_output_levels + + internal_stats.proximal_level_stats + .num_filtered_input_files_in_output_level; + job_stats_->num_filtered_input_files_at_output_level += + internal_stats.proximal_level_stats + .num_filtered_input_files_in_output_level; + job_stats_->total_skipped_input_bytes += + internal_stats.proximal_level_stats.bytes_skipped_non_output_levels + + internal_stats.proximal_level_stats.bytes_skipped_output_level; + } +} - // output information - compaction_job_stats_->total_output_bytes = stats.bytes_written; - compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob; - compaction_job_stats_->num_output_records = stats.num_output_records; - compaction_job_stats_->num_output_files = stats.num_output_files; - compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob; +void CompactionJob::UpdateCompactionJobOutputStatsFromInternalStats( + const Status& status, + const InternalStats::CompactionStatsFull& internal_stats) const { + assert(job_stats_); + job_stats_->elapsed_micros = internal_stats.output_level_stats.micros; + job_stats_->cpu_micros = internal_stats.output_level_stats.cpu_micros; - if (stats.num_output_files > 0) { + // output information + job_stats_->total_output_bytes = + internal_stats.output_level_stats.bytes_written; + job_stats_->total_output_bytes_blob = + internal_stats.output_level_stats.bytes_written_blob; + job_stats_->num_output_records = + internal_stats.output_level_stats.num_output_records; + job_stats_->num_output_files = + internal_stats.output_level_stats.num_output_files; + job_stats_->num_output_files_blob = + internal_stats.output_level_stats.num_output_files_blob; + + if (internal_stats.has_proximal_level_output) { + job_stats_->total_output_bytes += + internal_stats.proximal_level_stats.bytes_written; + job_stats_->total_output_bytes_blob += + internal_stats.proximal_level_stats.bytes_written_blob; + job_stats_->num_output_records += + internal_stats.proximal_level_stats.num_output_records; + job_stats_->num_output_files += + internal_stats.proximal_level_stats.num_output_files; + job_stats_->num_output_files_blob += + internal_stats.proximal_level_stats.num_output_files_blob; + } + + if (status.ok() && job_stats_->num_output_files > 0) { CopyPrefix(compact_->SmallestUserKey(), CompactionJobStats::kMaxPrefixLength, - &compaction_job_stats_->smallest_output_key_prefix); + &job_stats_->smallest_output_key_prefix); CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength, - &compaction_job_stats_->largest_output_key_prefix); + &job_stats_->largest_output_key_prefix); } } @@ -2217,8 +2746,8 @@ void CompactionJob::LogCompaction() { cfd->GetName().c_str(), scratch); // build event logger report auto stream = event_logger_->Log(); - stream << "job" << job_id_ << "event" << "compaction_started" - << "compaction_reason" + stream << "job" << job_id_ << "event" << "compaction_started" << "cf_name" + << cfd->GetName() << "compaction_reason" << GetCompactionReasonString(compaction->compaction_reason()); for (size_t i = 0; i < compaction->num_input_levels(); ++i) { stream << ("files_L" + std::to_string(compaction->level(i))); @@ -2230,23 +2759,24 @@ void CompactionJob::LogCompaction() { } stream << "score" << compaction->score() << "input_data_size" << compaction->CalculateTotalInputSize() << "oldest_snapshot_seqno" - << (existing_snapshots_.empty() + << (job_context_->snapshot_seqs.empty() ? int64_t{-1} // Use -1 for "none" - : static_cast(existing_snapshots_[0])); + : static_cast( + job_context_->GetEarliestSnapshotSequence())); if (compaction->SupportsPerKeyPlacement()) { - stream << "prenultimate_after_seqno" << penultimate_after_seqno_; + stream << "proximal_after_seqno" << proximal_after_seqno_; stream << "preserve_seqno_after" << preserve_seqno_after_; - stream << "penultimate_output_level" << compaction->GetPenultimateLevel(); - stream << "penultimate_output_range" - << GetCompactionPenultimateOutputRangeTypeString( - compaction->GetPenultimateOutputRangeType()); + stream << "proximal_output_level" << compaction->GetProximalLevel(); + stream << "proximal_output_range" + << GetCompactionProximalOutputRangeTypeString( + compaction->GetProximalOutputRangeType()); - if (compaction->GetPenultimateOutputRangeType() == - Compaction::PenultimateOutputRangeType::kDisabled) { + if (compaction->GetProximalOutputRangeType() == + Compaction::ProximalOutputRangeType::kDisabled) { ROCKS_LOG_WARN( db_options_.info_log, - "[%s] [JOB %d] Penultimate level output is disabled, likely " - "because of the range conflict in the penultimate level", + "[%s] [JOB %d] Proximal level output is disabled, likely " + "because of the range conflict in the proximal level", cfd->GetName().c_str(), job_id_); } } @@ -2271,4 +2801,409 @@ Env::IOPriority CompactionJob::GetRateLimiterPriority() { return Env::IO_LOW; } +Status CompactionJob::ReadTablePropertiesDirectly( + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, + const FileMetaData* file_meta, const ReadOptions& read_options, + std::shared_ptr* tp) { + std::unique_ptr file; + std::string file_name = GetTableFileName(file_meta->fd.GetNumber()); + FileOptions fopts = file_options_; + fopts.file_checksum = file_meta->file_checksum; + fopts.file_checksum_func_name = file_meta->file_checksum_func_name; + Status s = ioptions.fs->NewRandomAccessFile(file_name, fopts, &file, + nullptr /* dbg */); + if (!s.ok()) { + return s; + } + + std::unique_ptr file_reader( + new RandomAccessFileReader( + std::move(file), file_name, ioptions.clock, io_tracer_, + ioptions.stats, Histograms::SST_READ_MICROS /* hist_type */, + nullptr /* file_read_hist */, ioptions.rate_limiter.get(), + ioptions.listeners)); + + std::unique_ptr props; + + uint64_t magic_number = kBlockBasedTableMagicNumber; + + const auto* table_factory = moptions.table_factory.get(); + if (table_factory == nullptr) { + return Status::Incomplete("Table factory is not set"); + } else { + const auto& table_factory_name = table_factory->Name(); + if (table_factory_name == TableFactory::kPlainTableName()) { + magic_number = kPlainTableMagicNumber; + } else if (table_factory_name == TableFactory::kCuckooTableName()) { + magic_number = kCuckooTableMagicNumber; + } + } + + s = ReadTableProperties(file_reader.get(), file_meta->fd.GetFileSize(), + magic_number, ioptions, read_options, &props); + if (!s.ok()) { + return s; + } + + *tp = std::move(props); + return s; +} + +Status CompactionJob::ReadOutputFilesTableProperties( + const autovector& output_files, + const ReadOptions& read_options, + std::vector>& + output_files_table_properties, + bool is_proximal_level) { + assert(!output_files.empty()); + + static const char* level_type = + is_proximal_level ? "proximal output" : "output"; + + output_files_table_properties.reserve(output_files.size()); + + Status s; + + for (const FileMetaData& metadata : output_files) { + std::shared_ptr tp; + s = ReadTablePropertiesDirectly(compact_->compaction->immutable_options(), + compact_->compaction->mutable_cf_options(), + &metadata, read_options, &tp); + if (!s.ok()) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "Failed to read table properties for %s level output file #%" PRIu64 + ": %s", + level_type, metadata.fd.GetNumber(), s.ToString().c_str()); + return s; + } + + if (tp == nullptr) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Empty table property for %s level output file #%" PRIu64 + "", + level_type, metadata.fd.GetNumber()); + + s = Status::Corruption("Empty table property for " + + std::string(level_type) + + " level output files during resuming"); + return s; + } + output_files_table_properties.push_back(tp); + } + return s; +} + +void CompactionJob::RestoreCompactionOutputs( + const ColumnFamilyData* cfd, + const std::vector>& + output_files_table_properties, + SubcompactionProgressPerLevel& subcompaction_progress_per_level, + CompactionOutputs* outputs_to_restore) { + assert(outputs_to_restore->GetOutputs().size() == 0); + + const auto& output_files = subcompaction_progress_per_level.GetOutputFiles(); + + for (size_t i = 0; i < output_files.size(); i++) { + FileMetaData file_copy = output_files[i]; + + outputs_to_restore->AddOutput(std::move(file_copy), + cfd->internal_comparator(), + paranoid_file_checks_, true /* finished */); + + outputs_to_restore->UpdateTableProperties( + *output_files_table_properties[i]); + } + + outputs_to_restore->SetNumOutputRecords( + subcompaction_progress_per_level.GetNumProcessedOutputRecords()); +} + +// Attempt to resume compaction from a previously persisted compaction progress. +// +// RETURNS: +// - Status::OK(): +// * Input iterator positioned at next unprocessed key +// * CompactionOutputs objects fully restored for both output and proximal +// output levels in SubcompactionState +// * Compaction job statistics accurately reflect input and output records +// processed for record count verification +// * File number generation advanced to prevent conflicts with existing outputs +// - Status::NotFound(): No valid progress to resume from +// - Status::Corruption(): Resume key is invalid, beyond input range, or output +// restoration failed +Status CompactionJob::MaybeResumeSubcompactionProgressOnInputIterator( + SubcompactionState* sub_compact, InternalIterator* input_iter) { + const ReadOptions read_options(Env::IOActivity::kCompaction); + ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); + SubcompactionProgress& subcompaction_progress = + sub_compact->GetSubcompactionProgressRef(); + + if (subcompaction_progress.output_level_progress + .GetNumProcessedOutputRecords() == 0 && + subcompaction_progress.proximal_output_level_progress + .GetNumProcessedOutputRecords() == 0) { + return Status::NotFound("No subcompaction progress to resume"); + } + + ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Resuming compaction : %s", + cfd->GetName().c_str(), job_id_, + subcompaction_progress.ToString().c_str()); + + input_iter->Seek(subcompaction_progress.next_internal_key_to_compact); + + if (!input_iter->Valid()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "[%s] [JOB %d] Iterator is invalid after " + "seeking to the key to resume. This indicates the key is " + "incorrectly beyond the input data range.", + cfd->GetName().c_str(), job_id_); + return Status::Corruption( + "The key to resume is beyond the input data range"); + } else if (!input_iter->status().ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "[%s] [JOB %d] Iterator has error after seeking to " + "the key to resume: %s", + cfd->GetName().c_str(), job_id_, + input_iter->status().ToString().c_str()); + return Status::Corruption( + "Iterator has error status after seeking to the key: " + + input_iter->status().ToString()); + } + + sub_compact->compaction_job_stats.has_accurate_num_input_records = + subcompaction_progress.num_processed_input_records != 0; + + sub_compact->compaction_job_stats.num_input_records = + subcompaction_progress.num_processed_input_records; + + for (const bool& is_proximal_level : {false, true}) { + if (is_proximal_level && + !sub_compact->compaction->SupportsPerKeyPlacement()) { + continue; + } + + Status s; + SubcompactionProgressPerLevel& subcompaction_progress_per_level = + is_proximal_level + ? subcompaction_progress.proximal_output_level_progress + : subcompaction_progress.output_level_progress; + + const auto& output_files = + subcompaction_progress_per_level.GetOutputFiles(); + + std::vector> + output_files_table_properties; + + // TODO(hx235): investigate if we can skip reading properties to save read + // IO + s = ReadOutputFilesTableProperties(output_files, read_options, + output_files_table_properties); + if (!s.ok()) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "[%s] [JOB %d] Failed to read table properties for %s output level" + "files " + "during resume: %s.", + cfd->GetName().c_str(), job_id_, is_proximal_level ? "proximal" : "", + s.ToString().c_str()); + return Status::Corruption( + "Not able to resume due to table property reading error " + + s.ToString()); + } + + RestoreCompactionOutputs(cfd, output_files_table_properties, + subcompaction_progress_per_level, + sub_compact->Outputs(is_proximal_level)); + + // Skip past all the used file numbers to avoid creating new output files + // after resumption that conflict with the existing output files + for (const auto& file_meta : output_files) { + uint64_t file_number = file_meta.fd.GetNumber(); + while (versions_->NewFileNumber() <= file_number) { + versions_->FetchAddFileNumber(1); + } + } + } + + return Status::OK(); +} + +void CompactionJob::UpdateSubcompactionProgress( + const CompactionIterator* c_iter, const Slice next_table_min_key, + SubcompactionState* sub_compact) { + assert(c_iter); + SubcompactionProgress& subcompaction_progress = + sub_compact->GetSubcompactionProgressRef(); + + IterKey next_ikey_to_compact; + next_ikey_to_compact.SetInternalKey(ExtractUserKey(next_table_min_key), + kMaxSequenceNumber, kValueTypeForSeek); + subcompaction_progress.next_internal_key_to_compact = + next_ikey_to_compact.GetInternalKey().ToString(); + + // Track total processed input records for progress reporting by combining: + // - Resumed count: records already processed before compaction was + // interrupted + // - Current count: records scanned in the current compaction session + // Only update when both tracking mechanisms provide accurate counts to ensure + // reliability. + subcompaction_progress.num_processed_input_records = + c_iter->HasNumInputEntryScanned() && + sub_compact->compaction_job_stats.has_accurate_num_input_records + ? c_iter->NumInputEntryScanned() + + sub_compact->compaction_job_stats.num_input_records + : 0; + + UpdateSubcompactionProgressPerLevel( + sub_compact, false /* is_proximal_level */, subcompaction_progress); + + if (sub_compact->compaction->SupportsPerKeyPlacement()) { + UpdateSubcompactionProgressPerLevel( + sub_compact, true /* is_proximal_level */, subcompaction_progress); + } +} + +void CompactionJob::UpdateSubcompactionProgressPerLevel( + SubcompactionState* sub_compact, bool is_proximal_level, + SubcompactionProgress& subcompaction_progress) { + SubcompactionProgressPerLevel& subcompaction_progress_per_level = + is_proximal_level ? subcompaction_progress.proximal_output_level_progress + : subcompaction_progress.output_level_progress; + + subcompaction_progress_per_level.SetNumProcessedOutputRecords( + sub_compact->OutputStats(is_proximal_level)->num_output_records); + + const auto& prev_output_files = + subcompaction_progress_per_level.GetOutputFiles(); + + const auto& current_output_files = + sub_compact->Outputs(is_proximal_level)->GetOutputs(); + + for (size_t i = prev_output_files.size(); i < current_output_files.size(); + i++) { + subcompaction_progress_per_level.AddToOutputFiles( + current_output_files[i].meta); + } +} + +Status CompactionJob::PersistSubcompactionProgress( + SubcompactionState* sub_compact) { + SubcompactionProgress& subcompaction_progress = + sub_compact->GetSubcompactionProgressRef(); + + assert(compaction_progress_writer_); + + VersionEdit edit; + edit.SetSubcompactionProgress(subcompaction_progress); + + std::string record; + if (!edit.EncodeTo(&record)) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "[%s] [JOB %d] Failed to encode subcompaction " + "progress", + compact_->compaction->column_family_data()->GetName().c_str(), job_id_); + return Status::Corruption("Failed to encode subcompaction progress"); + } + + WriteOptions write_options(Env::IOActivity::kCompaction); + Status s = compaction_progress_writer_->AddRecord(write_options, record); + IOOptions opts; + if (s.ok()) { + s = WritableFileWriter::PrepareIOOptions(write_options, opts); + } + if (s.ok()) { + s = compaction_progress_writer_->file()->Sync(opts, db_options_.use_fsync); + } + + if (!s.ok()) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "[%s] [JOB %d] Failed to persist subcompaction " + "progress: %s", + compact_->compaction->column_family_data()->GetName().c_str(), job_id_, + s.ToString().c_str()); + return s; + } + + subcompaction_progress.output_level_progress + .UpdateLastPersistedOutputFilesCount(); + + subcompaction_progress.proximal_output_level_progress + .UpdateLastPersistedOutputFilesCount(); + + return Status::OK(); +} + +Status CompactionJob::VerifyInputRecordCount( + uint64_t num_input_range_del) const { + size_t ts_sz = compact_->compaction->column_family_data() + ->user_comparator() + ->timestamp_size(); + // When trim_ts_ is non-empty, CompactionIterator takes + // HistoryTrimmingIterator as input iterator and sees a trimmed view of + // input keys. So the number of keys it processed is not suitable for + // verification here. + // TODO: support verification when trim_ts_ is non-empty. + if (!(ts_sz > 0 && !trim_ts_.empty())) { + assert(internal_stats_.output_level_stats.num_input_records > 0); + // TODO: verify the number of range deletion entries. + uint64_t expected = internal_stats_.output_level_stats.num_input_records - + num_input_range_del; + uint64_t actual = job_stats_->num_input_records; + if (expected != actual) { + char scratch[2345]; + compact_->compaction->Summary(scratch, sizeof(scratch)); + std::string msg = + "Compaction number of input keys does not match " + "number of keys processed. Expected " + + std::to_string(expected) + " but processed " + + std::to_string(actual) + ". Compaction summary: " + scratch; + ROCKS_LOG_WARN( + db_options_.info_log, + "[%s] [JOB %d] VerifyInputRecordCount() Status: %s", + compact_->compaction->column_family_data()->GetName().c_str(), + job_context_->job_id, msg.c_str()); + if (db_options_.compaction_verify_record_count) { + return Status::Corruption(msg); + } + } + } + return Status::OK(); +} + +Status CompactionJob::VerifyOutputRecordCount() const { + uint64_t total_output_num = 0; + for (const auto& state : compact_->sub_compact_states) { + for (const auto& output : state.GetOutputs()) { + total_output_num += output.table_properties->num_entries - + output.table_properties->num_range_deletions; + } + } + + uint64_t expected = internal_stats_.output_level_stats.num_output_records; + if (internal_stats_.has_proximal_level_output) { + expected += internal_stats_.proximal_level_stats.num_output_records; + } + if (expected != total_output_num) { + char scratch[2345]; + compact_->compaction->Summary(scratch, sizeof(scratch)); + std::string msg = + "Number of keys in compaction output SST files does not match " + "number of keys added. Expected " + + std::to_string(expected) + " but there are " + + std::to_string(total_output_num) + + " in output SST files. Compaction summary: " + scratch; + ROCKS_LOG_WARN( + db_options_.info_log, + "[%s] [JOB %d] VerifyOutputRecordCount() status: %s", + compact_->compaction->column_family_data()->GetName().c_str(), + job_context_->job_id, msg.c_str()); + if (db_options_.compaction_verify_record_count) { + return Status::Corruption(msg); + } + } + return Status::OK(); +} } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index 730b5ddac945..21486f89538e 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -67,7 +67,7 @@ class SubcompactionState; // if needed. // // CompactionJob has 2 main stats: -// 1. CompactionJobStats compaction_job_stats_ +// 1. CompactionJobStats job_stats_ // CompactionJobStats is a public data structure which is part of Compaction // event listener that rocksdb share the job stats with the user. // Internally it's an aggregation of all the compaction_job_stats from each @@ -81,7 +81,7 @@ class SubcompactionState; // +------------------------+ | // | CompactionJob | | +------------------------+ // | | | | SubcompactionState | -// | compaction_job_stats +-----+ | | +// | job_stats +-----+ | | // | | +--------->| compaction_job_stats | // | | | | | // +------------------------+ | +------------------------+ @@ -98,16 +98,13 @@ class SubcompactionState; // +--------->+ | // +------------------------+ // -// 2. CompactionStatsFull compaction_stats_ +// 2. CompactionStatsFull internal_stats_ // `CompactionStatsFull` is an internal stats about the compaction, which // is eventually sent to `ColumnFamilyData::internal_stats_` and used for // logging and public metrics. // Internally, it's an aggregation of stats_ from each `SubcompactionState`. -// It has 2 parts, normal stats about the main compaction information and -// the penultimate level output stats. -// `SubcompactionState` maintains the CompactionOutputs for normal output and -// the penultimate level output if exists, the per_level stats is -// stored with the outputs. +// It has 2 parts, ordinary output level stats and the proximal level output +// stats. // +---------------------------+ // | SubcompactionState | // | | @@ -119,15 +116,15 @@ class SubcompactionState; // | | | // | | +----------------------+ | // +--------------------------------+ | | | CompactionOutputs | | -// | CompactionJob | | | | (penultimate_level) | | +// | CompactionJob | | | | (proximal_level) | | // | | +--------->| stats_ | | -// | compaction_stats_ | | | | +----------------------+ | +// | internal_stats_ | | | | +----------------------+ | // | +-------------------------+ | | | | | -// | |stats (normal) |------|----+ +---------------------------+ +// | |output_level_stats |------|----+ +---------------------------+ // | +-------------------------+ | | | // | | | | // | +-------------------------+ | | | +---------------------------+ -// | |penultimate_level_stats +------+ | | SubcompactionState | +// | |proximal_level_stats |------+ | | SubcompactionState | // | +-------------------------+ | | | | | // | | | | | +----------------------+ | // | | | | | | CompactionOutputs | | @@ -137,7 +134,7 @@ class SubcompactionState; // | | | // | | +----------------------+ | // | | | CompactionOutputs | | -// | | | (penultimate_level) | | +// | | | (proximal_level) | | // +--------->| stats_ | | // | +----------------------+ | // | | @@ -145,27 +142,31 @@ class SubcompactionState; class CompactionJob { public: - CompactionJob( - int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, - const MutableDBOptions& mutable_db_options, - const FileOptions& file_options, VersionSet* versions, - const std::atomic* shutting_down, LogBuffer* log_buffer, - FSDirectory* db_directory, FSDirectory* output_directory, - FSDirectory* blob_output_directory, Statistics* stats, - InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, - std::vector existing_snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, JobContext* job_context, - std::shared_ptr table_cache, EventLogger* event_logger, - bool paranoid_file_checks, bool measure_io_stats, - const std::string& dbname, CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri, const std::shared_ptr& io_tracer, - const std::atomic& manual_compaction_canceled, - const std::string& db_id = "", const std::string& db_session_id = "", - std::string full_history_ts_low = "", std::string trim_ts = "", - BlobFileCompletionCallback* blob_callback = nullptr, - int* bg_compaction_scheduled = nullptr, - int* bg_bottom_compaction_scheduled = nullptr); + // Constant false aborted flag, used for compaction service jobs + static const std::atomic kCompactionAbortedFalse; + + CompactionJob(int job_id, Compaction* compaction, + const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, + const FileOptions& file_options, VersionSet* versions, + const std::atomic* shutting_down, LogBuffer* log_buffer, + FSDirectory* db_directory, FSDirectory* output_directory, + FSDirectory* blob_output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + JobContext* job_context, std::shared_ptr table_cache, + EventLogger* event_logger, bool paranoid_file_checks, + bool measure_io_stats, const std::string& dbname, + CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri, + const std::shared_ptr& io_tracer, + const std::atomic& manual_compaction_canceled, + const std::atomic& compaction_aborted, + const std::string& db_id = "", + const std::string& db_session_id = "", + std::string full_history_ts_low = "", std::string trim_ts = "", + BlobFileCompletionCallback* blob_callback = nullptr, + int* bg_compaction_scheduled = nullptr, + int* bg_bottom_compaction_scheduled = nullptr); virtual ~CompactionJob(); @@ -179,9 +180,20 @@ class CompactionJob { // and organizing seqno <-> time info. `known_single_subcompact` is non-null // if we already have a known single subcompaction, with optional key bounds // (currently for executing a remote compaction). + // + // @param compaction_progress Previously saved compaction progress + // to resume from. If empty, compaction starts fresh from the + // beginning. + // + // @param compaction_progress_writer Writer for persisting + // subcompaction progress periodically during compaction + // execution. If nullptr, progress tracking is disabled and compaction + // cannot be resumed later. void Prepare( std::optional, std::optional>> - known_single_subcompact); + known_single_subcompact, + const CompactionProgress& compaction_progress = CompactionProgress{}, + log::Writer* compaction_progress_writer = nullptr); // REQUIRED mutex not held // Launch threads for each subcompaction and wait for them to finish. After @@ -199,23 +211,10 @@ class CompactionJob { IOStatus io_status() const { return io_status_; } protected: - // Update the following stats in compaction_stats_.stats - // - num_input_files_in_non_output_levels - // - num_input_files_in_output_level - // - bytes_read_non_output_levels - // - bytes_read_output_level - // - num_input_records - // - bytes_read_blob - // - num_dropped_records - // - // @param num_input_range_del if non-null, will be set to the number of range - // deletion entries in this compaction input. - // - // Returns true iff compaction_stats_.stats.num_input_records and - // num_input_range_del are calculated successfully. - bool UpdateCompactionStats(uint64_t* num_input_range_del = nullptr); - virtual void UpdateCompactionJobStats( - const InternalStats::CompactionStats& stats) const; + void UpdateCompactionJobOutputStatsFromInternalStats( + const Status& status, + const InternalStats::CompactionStatsFull& internal_stats) const; + void LogCompaction(); virtual void RecordCompactionIOStats(); void CleanupCompaction(); @@ -224,7 +223,7 @@ class CompactionJob { void ProcessKeyValueCompaction(SubcompactionState* sub_compact); CompactionState* compact_; - InternalStats::CompactionStatsFull compaction_stats_; + InternalStats::CompactionStatsFull internal_stats_; const ImmutableDBOptions& db_options_; const MutableDBOptions mutable_db_options_copy_; LogBuffer* log_buffer_; @@ -237,11 +236,42 @@ class CompactionJob { IOStatus io_status_; - CompactionJobStats* compaction_job_stats_; + CompactionJobStats* job_stats_; private: friend class CompactionJobTestBase; + // Collect the following stats from input files and table properties + // - num_input_files_in_non_output_levels + // - num_input_files_in_output_level + // - bytes_read_non_output_levels + // - bytes_read_output_level + // - num_input_records + // - bytes_read_blob + // - num_dropped_records + // and set them in internal_stats_.output_level_stats + // + // @param num_input_range_del if non-null, will be set to the number of range + // deletion entries in this compaction input. + // + // If any input file has potentially unreliable num_entries count (old SST + // files - details in implementation), + // job_stats_->has_accurate_num_input_records is set to false. + // + // Returns true iff internal_stats_.output_level_stats.num_input_records and + // num_input_range_del are calculated successfully. + // + // This should be called only once for compactions (not per subcompaction) + bool UpdateInternalStatsFromInputFiles( + uint64_t* num_input_range_del = nullptr); + + void UpdateCompactionJobInputStatsFromInternalStats( + const InternalStats::CompactionStatsFull& internal_stats, + uint64_t num_input_range_del) const; + + Status VerifyInputRecordCount(uint64_t num_input_range_del) const; + Status VerifyOutputRecordCount() const; + // Generates a histogram representing potential divisions of key ranges from // the input. It adds the starting and/or ending keys of certain input files // to the working set and then finds the approximate size of data in between @@ -249,6 +279,10 @@ class CompactionJob { // consecutive groups such that each group has a similar size. void GenSubcompactionBoundaries(); + void MaybeAssignCompactionProgressAndWriter( + const CompactionProgress& compaction_progress, + log::Writer* compaction_progress_writer); + // Get the number of planned subcompactions based on max_subcompactions and // extra reserved resources uint64_t GetSubcompactionsLimit(); @@ -269,18 +303,141 @@ class CompactionJob { // Release all reserved threads and update the compaction limits. void ReleaseSubcompactionResources(); + void InitializeCompactionRun(); + void RunSubcompactions(); + void UpdateTimingStats(uint64_t start_micros); + void RemoveEmptyOutputs(); + void CleanupAbortedSubcompactions(); + bool HasNewBlobFiles() const; + Status CollectSubcompactionErrors(); + Status SyncOutputDirectories(); + Status VerifyOutputFiles(); + void SetOutputTableProperties(); + // Aggregates subcompaction output stats to internal stat, and aggregates + // subcompaction's compaction job stats to the whole entire surrounding + // compaction job stats. + void AggregateSubcompactionOutputAndJobStats(); + Status VerifyCompactionRecordCounts(bool stats_built_from_input_table_prop, + uint64_t num_input_range_del); + void FinalizeCompactionRun(const Status& status, + bool stats_built_from_input_table_prop, + uint64_t num_input_range_del); + CompactionServiceJobStatus ProcessKeyValueCompactionWithCompactionService( SubcompactionState* sub_compact); + struct CompactionIOStatsSnapshot { + PerfLevel prev_perf_level = PerfLevel::kEnableTime; + uint64_t prev_write_nanos = 0; + uint64_t prev_fsync_nanos = 0; + uint64_t prev_range_sync_nanos = 0; + uint64_t prev_prepare_write_nanos = 0; + uint64_t prev_cpu_write_nanos = 0; + uint64_t prev_cpu_read_nanos = 0; + }; + + struct SubcompactionKeyBoundaries { + const std::optional start; + const std::optional end; + + // Boundaries without timestamps for read options + std::optional start_without_ts; + std::optional end_without_ts; + + // Timestamp management + static constexpr char kMaxTs[] = + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"; + std::string max_ts; + Slice ts_slice; + + // Internal key boundaries + IterKey start_ikey; + IterKey end_ikey; + Slice start_internal_key; + Slice end_internal_key; + + // User key boundaries + Slice start_user_key; + Slice end_user_key; + + SubcompactionKeyBoundaries(std::optional start_boundary, + std::optional end_boundary) + : start(start_boundary), end(end_boundary) {} + }; + + struct SubcompactionInternalIterators { + std::unique_ptr raw_input; + std::unique_ptr clip; + std::unique_ptr blob_counter; + std::unique_ptr trim_history_iter; + }; + + bool ShouldUseLocalCompaction(SubcompactionState* sub_compact); + CompactionIOStatsSnapshot InitializeIOStats(); + Status SetupAndValidateCompactionFilter( + SubcompactionState* sub_compact, + const CompactionFilter* configured_compaction_filter, + const CompactionFilter*& compaction_filter, + std::unique_ptr& compaction_filter_from_factory); + void InitializeReadOptionsAndBoundaries( + size_t ts_sz, ReadOptions& read_options, + SubcompactionKeyBoundaries& boundaries); + InternalIterator* CreateInputIterator( + SubcompactionState* sub_compact, ColumnFamilyData* cfd, + SubcompactionInternalIterators& iterators, + SubcompactionKeyBoundaries& boundaries, ReadOptions& read_options); + void CreateBlobFileBuilder( + SubcompactionState* sub_compact, ColumnFamilyData* cfd, + std::unique_ptr& blob_file_builder, + const WriteOptions& write_options); + std::unique_ptr CreateCompactionIterator( + SubcompactionState* sub_compact, ColumnFamilyData* cfd, + InternalIterator* input_iter, const CompactionFilter* compaction_filter, + MergeHelper& merge, std::unique_ptr& blob_file_builder, + const WriteOptions& write_options); + std::pair CreateFileHandlers( + SubcompactionState* sub_compact, SubcompactionKeyBoundaries& boundaries); + Status ProcessKeyValue(SubcompactionState* sub_compact, ColumnFamilyData* cfd, + CompactionIterator* c_iter, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func, + uint64_t& prev_cpu_micros); + void UpdateSubcompactionJobStatsIncrementally( + CompactionIterator* c_iter, CompactionJobStats* compaction_job_stats, + uint64_t cur_cpu_micros, uint64_t& prev_cpu_micros); + void FinalizeSubcompactionJobStats(SubcompactionState* sub_compact, + CompactionIterator* c_iter, + uint64_t start_cpu_micros, + uint64_t prev_cpu_micros, + const CompactionIOStatsSnapshot& io_stats); + Status FinalizeProcessKeyValueStatus(ColumnFamilyData* cfd, + InternalIterator* input_iter, + CompactionIterator* c_iter, + Status status); + Status CleanupCompactionFiles(SubcompactionState* sub_compact, Status status, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func); + Status FinalizeBlobFiles(SubcompactionState* sub_compact, + BlobFileBuilder* blob_file_builder, Status status); + void FinalizeSubcompaction(SubcompactionState* sub_compact, Status status, + const CompactionFileOpenFunc& open_file_func, + const CompactionFileCloseFunc& close_file_func, + BlobFileBuilder* blob_file_builder, + CompactionIterator* c_iter, + InternalIterator* input_iter, + uint64_t start_cpu_micros, + uint64_t prev_cpu_micros, + const CompactionIOStatsSnapshot& io_stats); + // update the thread status for starting a compaction. void ReportStartedCompaction(Compaction* compaction); - Status FinishCompactionOutputFile(const Status& input_status, - SubcompactionState* sub_compact, - CompactionOutputs& outputs, - const Slice& next_table_min_key, - const Slice* comp_start_user_key, - const Slice* comp_end_user_key); + Status FinishCompactionOutputFile( + const Status& input_status, + const ParsedInternalKey& prev_iter_output_internal_key, + const Slice& next_table_min_key, const Slice* comp_start_user_key, + const Slice* comp_end_user_key, const CompactionIterator* c_iter, + SubcompactionState* sub_compact, CompactionOutputs& outputs); Status InstallCompactionResults(bool* compaction_released); Status OpenCompactionOutputFile(SubcompactionState* sub_compact, CompactionOutputs& outputs); @@ -308,25 +465,13 @@ class CompactionJob { VersionSet* versions_; const std::atomic* shutting_down_; const std::atomic& manual_compaction_canceled_; + const std::atomic& compaction_aborted_; FSDirectory* db_directory_; FSDirectory* blob_output_directory_; InstrumentedMutex* db_mutex_; ErrorHandler* db_error_handler_; - // If there were two snapshots with seq numbers s1 and - // s2 and s1 < s2, and if we find two instances of a key k1 then lies - // entirely within s1 and s2, then the earlier version of k1 can be safely - // deleted because that version is not visible in any snapshot. - std::vector existing_snapshots_; SequenceNumber earliest_snapshot_; - - // This is the earliest snapshot that could be used for write-conflict - // checking by a transaction. For any user-key newer than this snapshot, we - // should make sure not to remove evidence that a write occurred. - SequenceNumber earliest_write_conflict_snapshot_; - - const SnapshotChecker* const snapshot_checker_; - JobContext* job_context_; std::shared_ptr table_cache_; @@ -363,13 +508,16 @@ class CompactionJob { // Minimal sequence number to preclude the data from the last level. If the // key has bigger (newer) sequence number than this, it will be precluded from - // the last level (output to penultimate level). - SequenceNumber penultimate_after_seqno_ = kMaxSequenceNumber; + // the last level (output to proximal level). + SequenceNumber proximal_after_seqno_ = kMaxSequenceNumber; // Options File Number used for Remote Compaction // Setting this requires DBMutex. uint64_t options_file_number_ = 0; + // Writer for persisting compaction progress during compaction + log::Writer* compaction_progress_writer_ = nullptr; + // Get table file name in where it's outputting to, which should also be in // `output_directory_`. virtual std::string GetTableFileName(uint64_t file_number); @@ -377,6 +525,43 @@ class CompactionJob { // The Compaction Read and Write priorities are the same for different // scenarios, such as write stalled. Env::IOPriority GetRateLimiterPriority(); + + Status MaybeResumeSubcompactionProgressOnInputIterator( + SubcompactionState* sub_compact, InternalIterator* input_iter); + + Status ReadOutputFilesTableProperties( + const autovector& temporary_output_file_allocation, + const ReadOptions& read_options, + std::vector>& + output_files_table_properties, + bool is_proximal_level = false); + + Status ReadTablePropertiesDirectly( + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, + const FileMetaData* file_meta, const ReadOptions& read_options, + std::shared_ptr* tp); + + void RestoreCompactionOutputs( + const ColumnFamilyData* cfd, + const std::vector>& + output_files_table_properties, + SubcompactionProgressPerLevel& subcompaction_progress_per_level, + CompactionOutputs* outputs_to_restore); + + bool ShouldUpdateSubcompactionProgress( + const SubcompactionState* sub_compact, const CompactionIterator* c_iter, + const ParsedInternalKey& prev_iter_output_internal_key, + const Slice& next_table_min_internal_key, const FileMetaData* meta) const; + + void UpdateSubcompactionProgress(const CompactionIterator* c_iter, + const Slice next_table_min_key, + SubcompactionState* sub_compact); + + Status PersistSubcompactionProgress(SubcompactionState* sub_compact); + + void UpdateSubcompactionProgressPerLevel( + SubcompactionState* sub_compact, bool is_proximal_level, + SubcompactionProgress& subcompaction_progress); }; // CompactionServiceInput is used the pass compaction information between two @@ -418,8 +603,9 @@ struct CompactionServiceInput { // CompactionServiceOutputFile is the metadata for the output SST file struct CompactionServiceOutputFile { std::string file_name; - SequenceNumber smallest_seqno; - SequenceNumber largest_seqno; + uint64_t file_size{}; + SequenceNumber smallest_seqno{}; + SequenceNumber largest_seqno{}; std::string smallest_internal_key; std::string largest_internal_key; uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; @@ -427,21 +613,26 @@ struct CompactionServiceOutputFile { uint64_t epoch_number = kUnknownEpochNumber; std::string file_checksum = kUnknownFileChecksum; std::string file_checksum_func_name = kUnknownFileChecksumFuncName; - uint64_t paranoid_hash; + uint64_t paranoid_hash{}; bool marked_for_compaction; UniqueId64x2 unique_id{}; TableProperties table_properties; + bool is_proximal_level_output; + Temperature file_temperature = Temperature::kUnknown; CompactionServiceOutputFile() = default; CompactionServiceOutputFile( - const std::string& name, SequenceNumber smallest, SequenceNumber largest, - std::string _smallest_internal_key, std::string _largest_internal_key, - uint64_t _oldest_ancester_time, uint64_t _file_creation_time, - uint64_t _epoch_number, const std::string& _file_checksum, + const std::string& name, uint64_t size, SequenceNumber smallest, + SequenceNumber largest, std::string _smallest_internal_key, + std::string _largest_internal_key, uint64_t _oldest_ancester_time, + uint64_t _file_creation_time, uint64_t _epoch_number, + const std::string& _file_checksum, const std::string& _file_checksum_func_name, uint64_t _paranoid_hash, bool _marked_for_compaction, UniqueId64x2 _unique_id, - const TableProperties& _table_properties) + const TableProperties& _table_properties, bool _is_proximal_level_output, + Temperature _file_temperature) : file_name(name), + file_size(size), smallest_seqno(smallest), largest_seqno(largest), smallest_internal_key(std::move(_smallest_internal_key)), @@ -454,7 +645,9 @@ struct CompactionServiceOutputFile { paranoid_hash(_paranoid_hash), marked_for_compaction(_marked_for_compaction), unique_id(std::move(_unique_id)), - table_properties(_table_properties) {} + table_properties(_table_properties), + is_proximal_level_output(_is_proximal_level_output), + file_temperature(_file_temperature) {} }; // CompactionServiceResult contains the compaction result from a different db @@ -470,8 +663,21 @@ struct CompactionServiceResult { uint64_t bytes_read = 0; uint64_t bytes_written = 0; + + // Job-level Compaction Stats. + // + // NOTE: Job level stats cannot be rebuilt from scratch by simply aggregating + // per-level stats due to some fields populated directly during compaction + // (e.g. RecordDroppedKeys()). This is why we need both job-level stats and + // per-level in the serialized result. If rebuilding job-level stats from + // per-level stats become possible in the future, consider deprecating this + // field. CompactionJobStats stats; + // Per-level Compaction Stats for both output_level_stats and + // proximal_level_stats + InternalStats::CompactionStatsFull internal_stats; + // serialization interface to read and write the object static Status Read(const std::string& data_str, CompactionServiceResult* obj); Status Write(std::string* output); @@ -494,9 +700,9 @@ class CompactionServiceCompactionJob : private CompactionJob { const std::atomic* shutting_down, LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats, InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, - std::vector existing_snapshots, - std::shared_ptr table_cache, EventLogger* event_logger, - const std::string& dbname, const std::shared_ptr& io_tracer, + JobContext* job_context, std::shared_ptr table_cache, + EventLogger* event_logger, const std::string& dbname, + const std::shared_ptr& io_tracer, const std::atomic& manual_compaction_canceled, const std::string& db_id, const std::string& db_session_id, std::string output_path, @@ -505,7 +711,9 @@ class CompactionServiceCompactionJob : private CompactionJob { // REQUIRED: mutex held // Like CompactionJob::Prepare() - void Prepare(); + void Prepare( + const CompactionProgress& compaction_progress = CompactionProgress{}, + log::Writer* compaction_progress_writer = nullptr); // Run the compaction in current thread and return the result Status Run(); @@ -517,9 +725,6 @@ class CompactionServiceCompactionJob : private CompactionJob { protected: void RecordCompactionIOStats() override; - void UpdateCompactionJobStats( - const InternalStats::CompactionStats& stats) const override; - private: // Get table file name in output_path std::string GetTableFileName(uint64_t file_number) override; diff --git a/db/compaction/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc index c4a05c951dfc..6a91271520d0 100644 --- a/db/compaction/compaction_job_stats_test.cc +++ b/db/compaction/compaction_job_stats_test.cc @@ -82,7 +82,7 @@ class CompactionJobStatsTest : public testing::Test, std::string dbname_; std::string alternative_wal_dir_; Env* env_; - DB* db_; + std::unique_ptr db_; std::vector handles_; uint32_t max_subcompactions_; @@ -123,7 +123,7 @@ class CompactionJobStatsTest : public testing::Test, static void SetUpTestCase() {} static void TearDownTestCase() {} - DBImpl* dbfull() { return static_cast_with_check(db_); } + DBImpl* dbfull() { return static_cast_with_check(db_.get()); } void CreateColumnFamilies(const std::vector& cfs, const Options& options) { @@ -162,7 +162,8 @@ class CompactionJobStatsTest : public testing::Test, column_families.emplace_back(cfs[i], options[i]); } DBOptions db_opts = DBOptions(options[0]); - return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); + auto s = DB::Open(db_opts, dbname_, column_families, &handles_, &db_); + return s; } Status TryReopenWithColumnFamilies(const std::vector& cfs, @@ -179,8 +180,7 @@ class CompactionJobStatsTest : public testing::Test, delete h; } handles_.clear(); - delete db_; - db_ = nullptr; + db_.reset(); } void DestroyAndReopen(const Options& options) { @@ -743,7 +743,7 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { } ASSERT_OK(Flush(1)); - ASSERT_OK(static_cast_with_check(db_)->TEST_WaitForCompact()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); stats_checker->set_verify_next_comp_io_stats(true); std::atomic first_prepare_write(true); @@ -944,7 +944,7 @@ TEST_P(CompactionJobStatsTest, UniversalCompactionTest) { start_key += key_base) { MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize, kValueSize, key_interval, compression_ratio, 1); - ASSERT_OK(static_cast_with_check(db_)->TEST_WaitForCompact()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U); } diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 1108223a6f29..7a6f77ee222a 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -17,6 +17,7 @@ #include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "db/version_set.h" +#include "file/filename.h" #include "file/random_access_file_reader.h" #include "file/writable_file_writer.h" #include "options/options_helper.h" @@ -43,7 +44,6 @@ void VerifyInitializationOfCompactionJobStats( ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U); ASSERT_EQ(compaction_job_stats.num_input_records, 0U); - ASSERT_EQ(compaction_job_stats.num_input_files, 0U); ASSERT_EQ(compaction_job_stats.num_input_files_at_output_level, 0U); ASSERT_EQ(compaction_job_stats.num_output_records, 0U); @@ -52,7 +52,6 @@ void VerifyInitializationOfCompactionJobStats( ASSERT_TRUE(compaction_job_stats.is_manual_compaction); ASSERT_FALSE(compaction_job_stats.is_remote_compaction); - ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U); ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U); ASSERT_EQ(compaction_job_stats.total_input_raw_key_bytes, 0U); @@ -212,12 +211,12 @@ class CompactionJobTestBase : public testing::Test { table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, + dbname_, &db_options_, mutable_db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr, /*read_only=*/false)), + /*error_handler=*/nullptr, /*unchanging=*/false)), shutting_down_(false), mock_table_factory_(new mock::MockTableFactory()), error_handler_(nullptr, db_options_, &mutex_), @@ -460,9 +459,10 @@ class CompactionJobTestBase : public testing::Test { ReadOptions read_opts; Status s = cf_options_.table_factory->NewTableReader( read_opts, - TableReaderOptions(cfd->ioptions(), nullptr, FileOptions(), + TableReaderOptions(cfd->ioptions(), /*prefix_extractor=*/nullptr, + /*compression_manager=*/nullptr, FileOptions(), cfd_->internal_comparator(), - 0 /* block_protection_bytes_per_key */), + /*block_protection_bytes_per_key=*/0), std::move(freader), file_size, &table_reader, false); ASSERT_OK(s); assert(table_reader); @@ -546,13 +546,13 @@ class CompactionJobTestBase : public testing::Test { ASSERT_OK(s); db_options_.info_log = info_log; - versions_.reset( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - test::kUnitTestDbId, /*db_session_id=*/"", - /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr, /*read_only=*/false)); + versions_.reset(new VersionSet( + dbname_, &db_options_, mutable_db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + test::kUnitTestDbId, /*db_session_id=*/"", + /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr, /*unchanging=*/false)); compaction_job_stats_.Reset(); VersionEdit new_db; @@ -595,11 +595,11 @@ class CompactionJobTestBase : public testing::Test { const std::vector>& input_files, const std::vector input_levels, std::function&& verify_func, - const std::vector& snapshots = {}) { + std::vector&& snapshots = {}) { const int kLastLevel = cf_options_.num_levels - 1; verify_per_key_placement_ = std::move(verify_func); mock::KVVector empty_map; - RunCompaction(input_files, input_levels, {empty_map}, snapshots, + RunCompaction(input_files, input_levels, {empty_map}, std::move(snapshots), kMaxSequenceNumber, kLastLevel, false); } @@ -608,7 +608,7 @@ class CompactionJobTestBase : public testing::Test { const std::vector>& input_files, const std::vector& input_levels, const std::vector& expected_results, - const std::vector& snapshots = {}, + std::vector&& snapshots = {}, SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, int output_level = 1, bool verify = true, std::vector expected_oldest_blob_file_numbers = {}, @@ -652,7 +652,8 @@ class CompactionJobTestBase : public testing::Test { mutable_cf_options_.max_compaction_bytes, 0, kNoCompression, cfd->GetLatestMutableCFOptions().compression_opts, Temperature::kUnknown, max_subcompactions, grandparents, - /*earliest_snapshot*/ std::nullopt, /*snapshot_checker*/ nullptr, true); + /*earliest_snapshot*/ std::nullopt, /*snapshot_checker*/ nullptr, + CompactionReason::kManualCompaction); compaction.FinalizeInputInfo(cfd->current()); assert(db_options_.info_log); @@ -665,16 +666,18 @@ class CompactionJobTestBase : public testing::Test { ucmp_->timestamp_size() == full_history_ts_low_.size()); const std::atomic kManualCompactionCanceledFalse{false}; JobContext job_context(1, false /* create_superversion */); + job_context.InitSnapshotContext(snapshot_checker, nullptr, + earliest_write_conflict_snapshot, + std::move(snapshots)); CompactionJob compaction_job( 0, &compaction, db_options_, mutable_db_options_, env_options_, versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr, - nullptr, nullptr, &mutex_, &error_handler_, snapshots, - earliest_write_conflict_snapshot, snapshot_checker, &job_context, - table_cache_, &event_logger, false, false, dbname_, - &compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */, + nullptr, nullptr, &mutex_, &error_handler_, &job_context, table_cache_, + &event_logger, false, false, dbname_, &compaction_job_stats_, + Env::Priority::USER, nullptr /* IOTracer */, /*manual_compaction_canceled=*/kManualCompactionCanceledFalse, - env_->GenerateUniqueId(), DBImpl::GenerateDbSessionId(nullptr), - full_history_ts_low_); + CompactionJob::kCompactionAbortedFalse, env_->GenerateUniqueId(), + DBImpl::GenerateDbSessionId(nullptr), full_history_ts_low_); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); compaction_job.Prepare(std::nullopt /*subcompact to be computed*/); @@ -1474,7 +1477,7 @@ TEST_F(CompactionJobTest, OldestBlobFileNumber) { /* expected_oldest_blob_file_numbers */ {19}); } -TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) { +TEST_F(CompactionJobTest, VerifyProximalLevelOutput) { cf_options_.last_level_temperature = Temperature::kCold; SyncPoint::GetInstance()->SetCallBack( "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { @@ -1487,8 +1490,7 @@ TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) { SyncPoint::GetInstance()->SetCallBack( "CompactionIterator::PrepareOutput.context", [&](void* arg) { auto context = static_cast(arg); - context->output_to_penultimate_level = - context->seq_num > latest_cold_seq; + context->output_to_proximal_level = context->seq_num > latest_cold_seq; }); SyncPoint::GetInstance()->EnableProcessing(); @@ -1534,11 +1536,11 @@ TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) { /*verify_func=*/[&](Compaction& comp) { for (char c = 'a'; c <= 'z'; c++) { if (c == 'a') { - comp.TEST_AssertWithinPenultimateLevelOutputRange( + comp.TEST_AssertWithinProximalLevelOutputRange( "a", true /*expect_failure*/); } else { std::string c_str{c}; - comp.TEST_AssertWithinPenultimateLevelOutputRange(c_str); + comp.TEST_AssertWithinProximalLevelOutputRange(c_str); } } }); @@ -1670,6 +1672,7 @@ TEST_F(CompactionJobTest, ResultSerialization) { UniqueId64x2 id{rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX)}; result.output_files.emplace_back( rnd.RandomString(rnd.Uniform(kStrMaxLen)) /* file_name */, + rnd64.Uniform(UINT64_MAX) /* file_size */, rnd64.Uniform(UINT64_MAX) /* smallest_seqno */, rnd64.Uniform(UINT64_MAX) /* largest_seqno */, rnd.RandomBinaryString( @@ -1682,7 +1685,8 @@ TEST_F(CompactionJobTest, ResultSerialization) { file_checksum /* file_checksum */, file_checksum_func_name /* file_checksum_func_name */, rnd64.Uniform(UINT64_MAX) /* paranoid_hash */, - rnd.OneIn(2) /* marked_for_compaction */, id /* unique_id */, tp); + rnd.OneIn(2) /* marked_for_compaction */, id /* unique_id */, tp, + false /* is_proximal_level_output */, Temperature::kHot); } result.output_level = rnd.Uniform(10); result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen)); @@ -1736,6 +1740,8 @@ TEST_F(CompactionJobTest, ResultSerialization) { ASSERT_EQ(deserialized_tmp.output_files[0].file_checksum, file_checksum); ASSERT_EQ(deserialized_tmp.output_files[0].file_checksum_func_name, file_checksum_func_name); + ASSERT_EQ(deserialized_tmp.output_files[0].file_temperature, + Temperature::kHot); } // Test unknown field @@ -2033,7 +2039,7 @@ TEST_F(CompactionJobTest, CutToAlignGrandparentBoundarySameKey) { snapshots.emplace_back(i); } RunCompaction({lvl0_files, lvl1_files}, input_levels, - {expected_file1, expected_file2}, snapshots); + {expected_file1, expected_file2}, std::move(snapshots)); } TEST_F(CompactionJobTest, CutForMaxCompactionBytesSameKey) { @@ -2092,7 +2098,8 @@ TEST_F(CompactionJobTest, CutForMaxCompactionBytesSameKey) { snapshots.emplace_back(i); } RunCompaction({lvl0_files, lvl1_files}, input_levels, - {expected_file1, expected_file2, expected_file3}, snapshots); + {expected_file1, expected_file2, expected_file3}, + std::move(snapshots)); } class CompactionJobTimestampTest : public CompactionJobTestBase { @@ -2402,7 +2409,6 @@ TEST_F(CompactionJobIOPriorityTest, GetRateLimiterPriority) { kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, true, Env::IO_LOW, Env::IO_LOW); } - } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc index 3e1c4402cea3..8c86df870dee 100644 --- a/db/compaction/compaction_outputs.cc +++ b/db/compaction/compaction_outputs.cc @@ -49,12 +49,16 @@ Status CompactionOutputs::Finish( meta->fd.file_size = current_bytes; meta->tail_size = builder_->GetTailSize(); meta->marked_for_compaction = builder_->NeedCompact(); - meta->user_defined_timestamps_persisted = static_cast( - builder_->GetTableProperties().user_defined_timestamps_persisted); + const TableProperties& tp = builder_->GetTableProperties(); + meta->user_defined_timestamps_persisted = + static_cast(tp.user_defined_timestamps_persisted); + ExtractTimestampFromTableProperties(tp, meta); } current_output().finished = true; stats_.bytes_written += current_bytes; - stats_.num_output_files = outputs_.size(); + stats_.bytes_written_pre_comp += builder_->PreCompressionSize(); + stats_.num_output_files = static_cast(outputs_.size()); + worker_cpu_micros_ += builder_->GetWorkerCPUMicros(); return s; } @@ -276,7 +280,11 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) { } // reach the max file size - if (current_output_file_size_ >= compaction_->max_output_file_size()) { + uint64_t estimated_file_size = current_output_file_size_; + if (compaction_->mutable_cf_options().target_file_size_is_upper_bound) { + estimated_file_size += builder_->EstimatedTailSize(); + } + if (estimated_file_size >= compaction_->max_output_file_size()) { return true; } @@ -357,7 +365,8 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) { Status CompactionOutputs::AddToOutput( const CompactionIterator& c_iter, const CompactionFileOpenFunc& open_file_func, - const CompactionFileCloseFunc& close_file_func) { + const CompactionFileCloseFunc& close_file_func, + const ParsedInternalKey& prev_iter_output_internal_key) { Status s; bool is_range_del = c_iter.IsDeleteRangeSentinelKey(); if (is_range_del && compaction_->bottommost_level()) { @@ -368,7 +377,8 @@ Status CompactionOutputs::AddToOutput( } const Slice& key = c_iter.key(); if (ShouldStopBefore(c_iter) && HasBuilder()) { - s = close_file_func(*this, c_iter.InputStatus(), key); + s = close_file_func(c_iter.InputStatus(), prev_iter_output_internal_key, + key, &c_iter, *this); if (!s.ok()) { return s; } @@ -792,8 +802,8 @@ void CompactionOutputs::FillFilesToCutForTtl() { } CompactionOutputs::CompactionOutputs(const Compaction* compaction, - const bool is_penultimate_level) - : compaction_(compaction), is_penultimate_level_(is_penultimate_level) { + const bool is_proximal_level) + : compaction_(compaction), is_proximal_level_(is_proximal_level) { partitioner_ = compaction->output_level() == 0 ? nullptr : compaction->CreateSstPartitioner(); diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h index 33259be4670a..757e1b6b85ed 100644 --- a/db/compaction/compaction_outputs.h +++ b/db/compaction/compaction_outputs.h @@ -21,7 +21,8 @@ namespace ROCKSDB_NAMESPACE { class CompactionOutputs; using CompactionFileOpenFunc = std::function; using CompactionFileCloseFunc = - std::function; + std::function; // Files produced by subcompaction, most of the functions are used by // compaction_job Open/Close compaction file functions. @@ -30,31 +31,36 @@ class CompactionOutputs { // compaction output file struct Output { Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp, - bool _enable_hash, bool _finished, uint64_t precalculated_hash) + bool _enable_hash, bool _finished, uint64_t precalculated_hash, + bool _is_proximal_level) : meta(std::move(_meta)), validator(_icmp, _enable_hash, precalculated_hash), - finished(_finished) {} + finished(_finished), + is_proximal_level(_is_proximal_level) {} FileMetaData meta; OutputValidator validator; bool finished; + bool is_proximal_level; std::shared_ptr table_properties; }; CompactionOutputs() = delete; explicit CompactionOutputs(const Compaction* compaction, - const bool is_penultimate_level); + const bool is_proximal_level); - bool IsPenultimateLevel() const { return is_penultimate_level_; } + bool IsProximalLevel() const { return is_proximal_level_; } // Add generated output to the list void AddOutput(FileMetaData&& meta, const InternalKeyComparator& icmp, bool enable_hash, bool finished = false, uint64_t precalculated_hash = 0) { outputs_.emplace_back(std::move(meta), icmp, enable_hash, finished, - precalculated_hash); + precalculated_hash, is_proximal_level_); } + const std::vector& GetOutputs() const { return outputs_; } + // Set new table builder for the current output void NewBuilder(const TableBuilderOptions& tboptions); @@ -63,34 +69,42 @@ class CompactionOutputs { file_writer_.reset(writer); } - // TODO: Remove it when remote compaction support tiered compaction - void AddBytesWritten(uint64_t bytes) { stats_.bytes_written += bytes; } - void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; } - void SetNumOutputFiles(uint64_t num) { stats_.num_output_files = num; } - // TODO: Move the BlobDB builder into CompactionOutputs const std::vector& GetBlobFileAdditions() const { - if (is_penultimate_level_) { + if (is_proximal_level_) { assert(blob_file_additions_.empty()); } return blob_file_additions_; } std::vector* GetBlobFileAdditionsPtr() { - assert(!is_penultimate_level_); + assert(!is_proximal_level_); return &blob_file_additions_; } bool HasBlobFileAdditions() const { return !blob_file_additions_.empty(); } + // Get all file paths (SST and blob) created during compaction. + const std::vector& GetOutputFilePaths() const { + return output_file_paths_; + } + + std::vector* GetOutputFilePathsPtr() { + return &output_file_paths_; + } + + void AddOutputFilePath(const std::string& path) { + output_file_paths_.push_back(path); + } + BlobGarbageMeter* CreateBlobGarbageMeter() { - assert(!is_penultimate_level_); + assert(!is_proximal_level_); blob_garbage_meter_ = std::make_unique(); return blob_garbage_meter_.get(); } BlobGarbageMeter* GetBlobGarbageMeter() const { - if (is_penultimate_level_) { + if (is_proximal_level_) { // blobdb doesn't support per_key_placement yet assert(blob_garbage_meter_ == nullptr); return nullptr; @@ -99,8 +113,9 @@ class CompactionOutputs { } void UpdateBlobStats() { - assert(!is_penultimate_level_); - stats_.num_output_files_blob = blob_file_additions_.size(); + assert(!is_proximal_level_); + stats_.num_output_files_blob = + static_cast(blob_file_additions_.size()); for (const auto& blob : blob_file_additions_) { stats_.bytes_written_blob += blob.GetTotalBlobBytes(); } @@ -169,6 +184,10 @@ class CompactionOutputs { uint64_t NumEntries() const { return builder_->NumEntries(); } + uint64_t GetWorkerCPUMicros() const { + return worker_cpu_micros_ + (builder_ ? builder_->GetWorkerCPUMicros() : 0); + } + void ResetBuilder() { builder_.reset(); current_output_file_size_ = 0; @@ -192,6 +211,10 @@ class CompactionOutputs { std::pair keep_seqno_range, const Slice& next_table_min_key, const std::string& full_history_ts_low); + void SetNumOutputRecords(uint64_t num_output_records) { + stats_.num_output_records = num_output_records; + } + private: friend class SubcompactionState; @@ -251,7 +274,8 @@ class CompactionOutputs { // close and open new compaction output with the functions provided. Status AddToOutput(const CompactionIterator& c_iter, const CompactionFileOpenFunc& open_file_func, - const CompactionFileCloseFunc& close_file_func); + const CompactionFileCloseFunc& close_file_func, + const ParsedInternalKey& prev_iter_output_internal_key); // Close the current output. `open_file_func` is needed for creating new file // for range-dels only output file. @@ -267,9 +291,12 @@ class CompactionOutputs { !range_del_agg->IsEmpty()) { status = open_file_func(*this); } + if (HasBuilder()) { + const ParsedInternalKey empty_internal_key{}; const Slice empty_key{}; - Status s = close_file_func(*this, status, empty_key); + Status s = close_file_func(status, empty_internal_key, empty_key, + nullptr /* c_iter */, *this); if (!s.ok() && status.ok()) { status = s; } @@ -297,6 +324,9 @@ class CompactionOutputs { uint64_t current_output_file_size_ = 0; SequenceNumber smallest_preferred_seqno_ = kMaxSequenceNumber; + // Sum of all the GetWorkerCPUMicros() for all the closed builders so far. + uint64_t worker_cpu_micros_ = 0; + // all the compaction outputs so far std::vector outputs_; @@ -304,12 +334,18 @@ class CompactionOutputs { std::vector blob_file_additions_; std::unique_ptr blob_garbage_meter_; - // Basic compaction output stats for this level's outputs - InternalStats::CompactionOutputsStats stats_; + // All file paths (SST and blob) created during compaction. + // Used for cleanup on abort - ensures orphan files are deleted even if + // they were removed from outputs_ or blob_file_additions_ (e.g., by + // RemoveLastEmptyOutput when file_size is 0 because builder was abandoned). + std::vector output_file_paths_; + + // Per level's output stat + InternalStats::CompactionStats stats_; - // indicate if this CompactionOutputs obj for penultimate_level, should always + // indicate if this CompactionOutputs obj for proximal_level, should always // be false if per_key_placement feature is not enabled. - const bool is_penultimate_level_; + const bool is_proximal_level_; // partitioner information std::string last_key_for_partitioner_; @@ -363,7 +399,7 @@ class CompactionOutputs { std::vector level_ptrs_; }; -// helper struct to concatenate the last level and penultimate level outputs +// helper struct to concatenate the last level and proximal level outputs // which could be replaced by std::ranges::join_view() in c++20 struct OutputIterator { public: diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc index 946dab5ddefe..14c25677c0b9 100644 --- a/db/compaction/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -27,12 +27,68 @@ namespace ROCKSDB_NAMESPACE { -bool FindIntraL0Compaction(const std::vector& level_files, - size_t min_files_to_compact, - uint64_t max_compact_bytes_per_del_file, - uint64_t max_compaction_bytes, - CompactionInputFiles* comp_inputs) { - TEST_SYNC_POINT("FindIntraL0Compaction"); +#ifndef NDEBUG +static void AssertCleanCut(const InternalKeyComparator* icmp, + VersionStorageInfo* vstorage, + CompactionInputFiles* inputs, int level, + Logger* logger) { + const std::vector& level_files = vstorage->LevelFiles(level); + if (inputs->files.empty() || level_files.empty()) { + return; + } + + const Comparator* ucmp = icmp->user_comparator(); + + // Find first and last input file indices in level + int first_input_idx = -1; + int last_input_idx = -1; + for (size_t i = 0; i < level_files.size(); i++) { + if (level_files[i] == inputs->files.front()) { + first_input_idx = static_cast(i); + } + if (level_files[i] == inputs->files.back()) { + last_input_idx = static_cast(i); + } + } + + // Check file before first input + if (first_input_idx > 0) { + const FileMetaData* prev_file = level_files[first_input_idx - 1]; + const FileMetaData* first_file = inputs->files.front(); + int cmp = sstableKeyCompare(ucmp, prev_file->largest, first_file->smallest); + if (cmp == 0) { + ROCKS_LOG_ERROR(logger, + "Clean cut violated: L%d unselected file %" PRIu64 + " adjacent to first selected file %" PRIu64, + level, prev_file->fd.GetNumber(), + first_file->fd.GetNumber()); + assert(false); + } + } + + // Check file after last input + if (last_input_idx >= 0 && + static_cast(last_input_idx) < level_files.size() - 1) { + const FileMetaData* last_file = inputs->files.back(); + const FileMetaData* next_file = level_files[last_input_idx + 1]; + int cmp = sstableKeyCompare(ucmp, last_file->largest, next_file->smallest); + if (cmp == 0) { + ROCKS_LOG_ERROR(logger, + "Clean cut violated: L%d unselected file %" PRIu64 + " adjacent to last selected file %" PRIu64, + level, next_file->fd.GetNumber(), + last_file->fd.GetNumber()); + assert(false); + } + } +} +#endif // NDEBUG + +bool PickCostBasedIntraL0Compaction( + const std::vector& level_files, size_t min_files_to_compact, + uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes, + CompactionInputFiles* comp_inputs) { + TEST_SYNC_POINT("PickCostBasedIntraL0Compaction"); size_t start = 0; @@ -242,7 +298,7 @@ bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/, GetRange(*inputs, &smallest, &largest); inputs->clear(); vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files, - hint_index, &hint_index, true, + hint_index, &hint_index, true, nullptr, next_smallest); } while (inputs->size() > old_size); @@ -250,6 +306,10 @@ bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/, // inputs. thus, inputs should be non-empty here assert(!inputs->empty()); +#ifndef NDEBUG + AssertCleanCut(icmp_, vstorage, inputs, level, ioptions_.logger); +#endif // NDEBUG + // If, after the expansion, there are files that are already under // compaction, then we must drop/cancel this compaction. if (AreFilesInCompaction(inputs->files)) { @@ -272,8 +332,8 @@ bool CompactionPicker::RangeOverlapWithCompaction( return true; } if (c->SupportsPerKeyPlacement()) { - if (c->OverlapPenultimateLevelOutputRange(smallest_user_key, - largest_user_key)) { + if (c->OverlapProximalLevelOutputRange(smallest_user_key, + largest_user_key)) { return true; } } @@ -284,7 +344,7 @@ bool CompactionPicker::RangeOverlapWithCompaction( bool CompactionPicker::FilesRangeOverlapWithCompaction( const std::vector& inputs, int level, - int penultimate_level) const { + int proximal_level) const { bool is_empty = true; for (auto& in : inputs) { if (!in.empty()) { @@ -301,18 +361,18 @@ bool CompactionPicker::FilesRangeOverlapWithCompaction( // files cannot be overlapped in the order of L0 files. InternalKey smallest, largest; GetRange(inputs, &smallest, &largest, Compaction::kInvalidLevel); - if (penultimate_level != Compaction::kInvalidLevel) { + if (proximal_level != Compaction::kInvalidLevel) { if (ioptions_.compaction_style == kCompactionStyleUniversal) { if (RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(), - penultimate_level)) { + proximal_level)) { return true; } } else { - InternalKey penultimate_smallest, penultimate_largest; - GetRange(inputs, &penultimate_smallest, &penultimate_largest, level); - if (RangeOverlapWithCompaction(penultimate_smallest.user_key(), - penultimate_largest.user_key(), - penultimate_level)) { + InternalKey proximal_smallest, proximal_largest; + GetRange(inputs, &proximal_smallest, &proximal_largest, level); + if (RangeOverlapWithCompaction(proximal_smallest.user_key(), + proximal_largest.user_key(), + proximal_level)) { return true; } } @@ -333,11 +393,13 @@ bool CompactionPicker::AreFilesInCompaction( return false; } -Compaction* CompactionPicker::CompactFiles( +Compaction* CompactionPicker::PickCompactionForCompactFiles( const CompactionOptions& compact_options, const std::vector& input_files, int output_level, VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, uint32_t output_path_id) { + const MutableDBOptions& mutable_db_options, uint32_t output_path_id, + std::optional earliest_snapshot, + const SnapshotChecker* snapshot_checker) { #ifndef NDEBUG assert(input_files.size()); // This compaction output should not overlap with a running compaction as @@ -353,7 +415,7 @@ Compaction* CompactionPicker::CompactFiles( } assert(output_level == 0 || !FilesRangeOverlapWithCompaction( input_files, output_level, - Compaction::EvaluatePenultimateLevel( + Compaction::EvaluateProximalLevel( vstorage, mutable_cf_options, ioptions_, start_level, output_level))); #endif /* !NDEBUG */ @@ -373,15 +435,16 @@ Compaction* CompactionPicker::CompactFiles( // without configurable `CompressionOptions`, which is inconsistent. compression_type = compact_options.compression; } + auto c = new Compaction( vstorage, ioptions_, mutable_cf_options, mutable_db_options, input_files, output_level, compact_options.output_file_size_limit, mutable_cf_options.max_compaction_bytes, output_path_id, compression_type, GetCompressionOptions(mutable_cf_options, vstorage, output_level), - mutable_cf_options.default_write_temperature, + compact_options.output_temperature_override, compact_options.max_subcompactions, - /* grandparents */ {}, /* earliest_snapshot */ std::nullopt, - /* snapshot_checker */ nullptr, true); + /* grandparents */ {}, earliest_snapshot, snapshot_checker, + CompactionReason::kManualCompaction); RegisterCompaction(c); return c; } @@ -462,7 +525,8 @@ bool CompactionPicker::SetupOtherInputs( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, CompactionInputFiles* inputs, CompactionInputFiles* output_level_inputs, int* parent_index, - int base_index, bool only_expand_towards_right) { + int base_index, bool only_expand_towards_right, + const FileMetaData* starting_l0_file) { assert(!inputs->empty()); assert(output_level_inputs->empty()); const int input_level = inputs->level; @@ -518,11 +582,11 @@ bool CompactionPicker::SetupOtherInputs( // Round-robin compaction only allows expansion towards the larger side. vstorage->GetOverlappingInputs(input_level, &smallest, &all_limit, &expanded_inputs.files, base_index, - nullptr); + nullptr, true, starting_l0_file); } else { vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit, &expanded_inputs.files, base_index, - nullptr); + nullptr, true, starting_l0_file); } uint64_t expanded_inputs_size = TotalFileSize(expanded_inputs.files); if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) { @@ -601,13 +665,14 @@ void CompactionPicker::GetGrandparents( } } -Compaction* CompactionPicker::CompactRange( +Compaction* CompactionPicker::PickCompactionForCompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, int input_level, int output_level, const CompactRangeOptions& compact_range_options, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict, - uint64_t max_file_num_to_ignore, const std::string& trim_ts) { + uint64_t max_file_num_to_ignore, const std::string& trim_ts, + const std::string& full_history_ts_low) { // CompactionPickerFIFO has its own implementation of compact range assert(ioptions_.compaction_style != kCompactionStyleFIFO); @@ -617,8 +682,8 @@ Compaction* CompactionPicker::CompactRange( // Universal compaction with more than one level always compacts all the // files together to the last level. assert(vstorage->num_levels() > 1); - int max_output_level = - vstorage->MaxOutputLevel(ioptions_.allow_ingest_behind); + int max_output_level = vstorage->MaxOutputLevel( + ioptions_.cf_allow_ingest_behind || ioptions_.allow_ingest_behind); // DBImpl::CompactRange() set output level to be the last level assert(output_level == max_output_level); // DBImpl::RunManualCompaction will make full range for universal compaction @@ -659,9 +724,9 @@ Compaction* CompactionPicker::CompactRange( // overlaping outputs in the same level. if (FilesRangeOverlapWithCompaction( inputs, output_level, - Compaction::EvaluatePenultimateLevel(vstorage, mutable_cf_options, - ioptions_, start_level, - output_level))) { + Compaction::EvaluateProximalLevel(vstorage, mutable_cf_options, + ioptions_, start_level, + output_level))) { // This compaction output could potentially conflict with the output // of a currently running compaction, we cannot run it. *manual_conflict = true; @@ -677,18 +742,17 @@ Compaction* CompactionPicker::CompactRange( compact_range_options.target_path_id, GetCompressionType(vstorage, mutable_cf_options, output_level, 1), GetCompressionOptions(mutable_cf_options, vstorage, output_level), - mutable_cf_options.default_write_temperature, - compact_range_options.max_subcompactions, + Temperature::kUnknown, compact_range_options.max_subcompactions, /* grandparents */ {}, /* earliest_snapshot */ std::nullopt, - /* snapshot_checker */ nullptr, - /* is manual */ true, trim_ts, /* score */ -1, - /* deletion_compaction */ false, /* l0_files_might_overlap */ true, - CompactionReason::kUnknown, + /* snapshot_checker */ nullptr, CompactionReason::kManualCompaction, + trim_ts, /* score */ -1, + /* l0_files_might_overlap */ true, compact_range_options.blob_garbage_collection_policy, compact_range_options.blob_garbage_collection_age_cutoff); RegisterCompaction(c); - vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options); + vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options, + full_history_ts_low); return c; } @@ -848,9 +912,9 @@ Compaction* CompactionPicker::CompactRange( // overlaping outputs in the same level. if (FilesRangeOverlapWithCompaction( compaction_inputs, output_level, - Compaction::EvaluatePenultimateLevel(vstorage, mutable_cf_options, - ioptions_, input_level, - output_level))) { + Compaction::EvaluateProximalLevel(vstorage, mutable_cf_options, + ioptions_, input_level, + output_level))) { // This compaction output could potentially conflict with the output // of a currently running compaction, we cannot run it. *manual_conflict = true; @@ -870,12 +934,11 @@ Compaction* CompactionPicker::CompactRange( GetCompressionType(vstorage, mutable_cf_options, output_level, vstorage->base_level()), GetCompressionOptions(mutable_cf_options, vstorage, output_level), - mutable_cf_options.default_write_temperature, - compact_range_options.max_subcompactions, std::move(grandparents), + Temperature::kUnknown, compact_range_options.max_subcompactions, + std::move(grandparents), /* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr, - /* is manual */ true, trim_ts, /* score */ -1, - /* deletion_compaction */ false, /* l0_files_might_overlap */ true, - CompactionReason::kUnknown, + CompactionReason::kManualCompaction, trim_ts, /* score */ -1, + /* l0_files_might_overlap */ true, compact_range_options.blob_garbage_collection_policy, compact_range_options.blob_garbage_collection_age_cutoff); @@ -886,7 +949,8 @@ Compaction* CompactionPicker::CompactRange( // takes running compactions into account (by skipping files that are already // being compacted). Since we just changed compaction score, we recalculate it // here - vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options); + vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options, + full_history_ts_low); return compaction; } @@ -1137,7 +1201,7 @@ Status CompactionPicker::SanitizeAndConvertCompactionInputFiles( if (output_level != 0 && FilesRangeOverlapWithCompaction( *converted_input_files, output_level, - Compaction::EvaluatePenultimateLevel( + Compaction::EvaluateProximalLevel( version->storage_info(), version->GetMutableCFOptions(), ioptions_, (*converted_input_files)[0].level, output_level))) { return Status::Aborted( @@ -1154,7 +1218,7 @@ void CompactionPicker::RegisterCompaction(Compaction* c) { assert(ioptions_.compaction_style != kCompactionStyleLevel || c->output_level() == 0 || !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level(), - c->GetPenultimateLevel())); + c->GetProximalLevel())); // CompactionReason::kExternalSstIngestion's start level is just a placeholder // number without actual meaning as file ingestion technically does not have // an input level like other compactions @@ -1231,7 +1295,7 @@ void CompactionPicker::PickFilesMarkedForCompaction( bool CompactionPicker::GetOverlappingL0Files( VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs, - int output_level, int* parent_index) { + int output_level, int* parent_index, const FileMetaData* starting_l0_file) { // Two level 0 compaction won't run at the same time, so don't need to worry // about files on level 0 being compacted. assert(level0_compactions_in_progress()->empty()); @@ -1242,7 +1306,11 @@ bool CompactionPicker::GetOverlappingL0Files( // which will include the picked file. start_level_inputs->files.clear(); vstorage->GetOverlappingInputs(0, &smallest, &largest, - &(start_level_inputs->files)); + &(start_level_inputs->files), + /*hint_index=*/-1, + /*file_index=*/nullptr, + /*expand_range=*/true, + /*starting_l0_file=*/starting_l0_file); // If we include more L0 files in the same compaction run it can // cause the 'smallest' and 'largest' key to get extended to a diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h index 6285e054301e..bb9b22456e50 100644 --- a/db/compaction/compaction_picker.h +++ b/db/compaction/compaction_picker.h @@ -65,7 +65,8 @@ class CompactionPicker { const MutableDBOptions& mutable_db_options, const std::vector& existing_snapshots, const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage, - LogBuffer* log_buffer) = 0; + LogBuffer* log_buffer, const std::string& full_history_ts_low, + bool require_max_output_level = false) = 0; // The returned Compaction might not include the whole requested range. // In that case, compaction_end will be set to the next key that needs @@ -75,14 +76,15 @@ class CompactionPicker { // *compaction_end should point to valid InternalKey! // REQUIRES: If not compacting all levels (input_level == kCompactAllLevels), // then levels between input_level and output_level should be empty. - virtual Compaction* CompactRange( + virtual Compaction* PickCompactionForCompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, int input_level, int output_level, const CompactRangeOptions& compact_range_options, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict, - uint64_t max_file_num_to_ignore, const std::string& trim_ts); + uint64_t max_file_num_to_ignore, const std::string& trim_ts, + const std::string& full_history_ts_low); // The maximum allowed output level. Default value is NumberLevels() - 1. virtual int MaxOutputLevel() const { return NumberLevels() - 1; } @@ -117,12 +119,17 @@ class CompactionPicker { // Caller must provide a set of input files that has been passed through // `SanitizeAndConvertCompactionInputFiles` earlier. The lock should not be // released between that call and this one. - Compaction* CompactFiles(const CompactionOptions& compact_options, - const std::vector& input_files, - int output_level, VersionStorageInfo* vstorage, - const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, - uint32_t output_path_id); + // + // TODO - Remove default values for earliest_snapshot and snapshot_checker + // and require all callers to pass them in so that DB::CompactFiles() can + // also benefit from Standalone Range Tombstone Optimization + Compaction* PickCompactionForCompactFiles( + const CompactionOptions& compact_options, + const std::vector& input_files, int output_level, + VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, uint32_t output_path_id, + std::optional earliest_snapshot = std::nullopt, + const SnapshotChecker* snapshot_checker = nullptr); // Converts a set of compaction input file numbers into // a list of CompactionInputFiles. @@ -138,6 +145,12 @@ class CompactionPicker { return !level0_compactions_in_progress_.empty(); } + // Is any compaction in progress + bool IsCompactionInProgress() const { + return !(level0_compactions_in_progress_.empty() && + compactions_in_progress_.empty()); + } + // Return true if the passed key range overlap with a compaction output // that is currently running. bool RangeOverlapWithCompaction(const Slice& smallest_user_key, @@ -190,15 +203,18 @@ class CompactionPicker { // key range of a currently running compaction. bool FilesRangeOverlapWithCompaction( const std::vector& inputs, int level, - int penultimate_level) const; + int proximal_level) const; + // @param starting_l0_file If not null, restricts L0 file selection to only + // include files at or older than starting_l0_file. bool SetupOtherInputs(const std::string& cf_name, const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, CompactionInputFiles* inputs, CompactionInputFiles* output_level_inputs, int* parent_index, int base_index, - bool only_expand_towards_right = false); + bool only_expand_towards_right = false, + const FileMetaData* starting_l0_file = nullptr); void GetGrandparents(VersionStorageInfo* vstorage, const CompactionInputFiles& inputs, @@ -211,9 +227,12 @@ class CompactionPicker { CompactionInputFiles* start_level_inputs, std::function skip_marked_file); + // @param starting_l0_file If not null, restricts L0 file selection to only + // include files at or older than starting_l0_file. bool GetOverlappingL0Files(VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs, - int output_level, int* parent_index); + int output_level, int* parent_index, + const FileMetaData* starting_l0_file = nullptr); // Register this compaction in the set of running compactions void RegisterCompaction(Compaction* c); @@ -266,23 +285,24 @@ class NullCompactionPicker : public CompactionPicker { const MutableDBOptions& /*mutable_db_options*/, const std::vector& /*existing_snapshots*/, const SnapshotChecker* /*snapshot_checker*/, - VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */) override { + VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */, + const std::string& /*full_history_ts_low*/, + bool /*require_max_output_level*/) override { return nullptr; } // Always return "nullptr" - Compaction* CompactRange(const std::string& /*cf_name*/, - const MutableCFOptions& /*mutable_cf_options*/, - const MutableDBOptions& /*mutable_db_options*/, - VersionStorageInfo* /*vstorage*/, - int /*input_level*/, int /*output_level*/, - const CompactRangeOptions& /*compact_range_options*/, - const InternalKey* /*begin*/, - const InternalKey* /*end*/, - InternalKey** /*compaction_end*/, - bool* /*manual_conflict*/, - uint64_t /*max_file_num_to_ignore*/, - const std::string& /*trim_ts*/) override { + Compaction* PickCompactionForCompactRange( + const std::string& /*cf_name*/, + const MutableCFOptions& /*mutable_cf_options*/, + const MutableDBOptions& /*mutable_db_options*/, + VersionStorageInfo* /*vstorage*/, int /*input_level*/, + int /*output_level*/, + const CompactRangeOptions& /*compact_range_options*/, + const InternalKey* /*begin*/, const InternalKey* /*end*/, + InternalKey** /*compaction_end*/, bool* /*manual_conflict*/, + uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/, + const std::string& /*full_history_ts_low*/) override { return nullptr; } @@ -308,11 +328,10 @@ class NullCompactionPicker : public CompactionPicker { // files. Cannot be nullptr. // // @return true iff compaction was found. -bool FindIntraL0Compaction(const std::vector& level_files, - size_t min_files_to_compact, - uint64_t max_compact_bytes_per_del_file, - uint64_t max_compaction_bytes, - CompactionInputFiles* comp_inputs); +bool PickCostBasedIntraL0Compaction( + const std::vector& level_files, size_t min_files_to_compact, + uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes, + CompactionInputFiles* comp_inputs); CompressionType GetCompressionType(const VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc index d5c735194004..e13c333856d2 100644 --- a/db/compaction/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -9,6 +9,7 @@ #include "db/compaction/compaction_picker_fifo.h" +#include #include #include #include @@ -31,6 +32,29 @@ uint64_t GetTotalFilesSize(const std::vector& files) { } return total_size; } + +// Compute effective data size and capacity limit for FIFO compaction. +// When max_data_files_size > 0 (blob-aware mode), the effective size includes +// both SST and blob file sizes, and the limit is max_data_files_size. +// Otherwise, only SST sizes are used with max_table_files_size as the limit. +void GetEffectiveSizeAndLimit(const CompactionOptionsFIFO& fifo_opts, + uint64_t total_sst_size, uint64_t total_blob_size, + uint64_t* effective_size, + uint64_t* effective_max) { + *effective_size = total_sst_size; + *effective_max = fifo_opts.max_table_files_size; + if (fifo_opts.max_data_files_size > 0) { + *effective_size += total_blob_size; + *effective_max = fifo_opts.max_data_files_size; + } +} + +// Return the effective capacity limit for FIFO compaction. +// Convenience wrapper when only the limit is needed (e.g., PickTTLCompaction). +uint64_t GetEffectiveMax(const CompactionOptionsFIFO& fifo_opts) { + return fifo_opts.max_data_files_size > 0 ? fifo_opts.max_data_files_size + : fifo_opts.max_table_files_size; +} } // anonymous namespace bool FIFOCompactionPicker::NeedsCompaction( @@ -98,10 +122,43 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction( // Return a nullptr and proceed to size-based FIFO compaction if: // 1. there are no files older than ttl OR // 2. there are a few files older than ttl, but deleting them will not bring - // the total size to be less than max_table_files_size threshold. - if (inputs[0].files.empty() || - total_size > - mutable_cf_options.compaction_options_fifo.max_table_files_size) { + // the total size to be less than the size threshold. + uint64_t effective_max = + GetEffectiveMax(mutable_cf_options.compaction_options_fifo); + // Estimate the effective remaining data after dropping TTL-expired SSTs. + // Each dropped SST also frees a proportional share of blob data. + // + // In multi-level FIFO (migration), we must use total SST across ALL levels + // as the reference, because total_blob covers all levels. Using only L0 + // SST would inflate the blob estimate. + uint64_t effective_remaining = total_size; + if (mutable_cf_options.compaction_options_fifo.max_data_files_size > 0) { + uint64_t total_blob = vstorage->GetBlobStats().total_file_size; + // Compute total SST across all levels so the reference scope matches + // total_blob's scope (all levels). + uint64_t total_sst_all_levels = GetTotalFilesSize(level_files); + for (int level = 1; level < vstorage->num_levels(); ++level) { + total_sst_all_levels += GetTotalFilesSize(vstorage->LevelFiles(level)); + } + // remaining_sst_all = total_sst_all - dropped_l0_sst + // total_size is the remaining L0 SST after removing expired files; + // original L0 SST minus remaining L0 SST = dropped. + uint64_t original_l0_sst = GetTotalFilesSize(level_files); + uint64_t dropped_sst = original_l0_sst - total_size; + uint64_t remaining_sst_all = total_sst_all_levels - dropped_sst; + // Proportional blob estimate: each SST byte "owns" a proportional + // share of blob bytes. Both reference sizes must come from the same + // scope (all levels) to avoid inflated estimates. + if (total_sst_all_levels > 0 && total_blob > 0) { + effective_remaining = + remaining_sst_all + + static_cast(static_cast(remaining_sst_all) / + total_sst_all_levels * total_blob); + } else { + effective_remaining = remaining_sst_all; + } + } + if (inputs[0].files.empty() || effective_remaining > effective_max) { return nullptr; } @@ -124,14 +181,11 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction( Compaction* c = new Compaction( vstorage, ioptions_, mutable_cf_options, mutable_db_options, std::move(inputs), 0, 0, 0, 0, kNoCompression, - mutable_cf_options.compression_opts, - mutable_cf_options.default_write_temperature, + mutable_cf_options.compression_opts, Temperature::kUnknown, /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt, - /* snapshot_checker */ nullptr, - /* is manual */ false, + /* snapshot_checker */ nullptr, CompactionReason::kFIFOTtl, /* trim_ts */ "", vstorage->CompactionScore(0), - /* is deletion compaction */ true, /* l0_files_might_overlap */ true, - CompactionReason::kFIFOTtl); + /* l0_files_might_overlap */ true); return c; } @@ -154,7 +208,9 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, LogBuffer* log_buffer) { - // compute the total size and identify the last non-empty level + const auto& fifo_opts = mutable_cf_options.compaction_options_fifo; + + // compute the total SST size and identify the last non-empty level int last_level = 0; uint64_t total_size = 0; for (int level = 0; level < vstorage->num_levels(); ++level) { @@ -167,54 +223,13 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( const std::vector& last_level_files = vstorage->LevelFiles(last_level); - if (last_level == 0 && - total_size <= - mutable_cf_options.compaction_options_fifo.max_table_files_size) { - // total size not exceeded, try to find intra level 0 compaction if enabled - const std::vector& level0_files = vstorage->LevelFiles(0); - if (mutable_cf_options.compaction_options_fifo.allow_compaction && - level0_files.size() > 0) { - CompactionInputFiles comp_inputs; - // try to prevent same files from being compacted multiple times, which - // could produce large files that may never TTL-expire. Achieve this by - // disallowing compactions with files larger than memtable (inflate its - // size by 10% to account for uncompressed L0 files that may have size - // slightly greater than memtable size limit). - size_t max_compact_bytes_per_del_file = - static_cast(MultiplyCheckOverflow( - static_cast(mutable_cf_options.write_buffer_size), - 1.1)); - if (FindIntraL0Compaction( - level0_files, - mutable_cf_options - .level0_file_num_compaction_trigger /* min_files_to_compact */ - , - max_compact_bytes_per_del_file, - mutable_cf_options.max_compaction_bytes, &comp_inputs)) { - Compaction* c = new Compaction( - vstorage, ioptions_, mutable_cf_options, mutable_db_options, - {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */, - 0 /* max compaction bytes, not applicable */, - 0 /* output path ID */, mutable_cf_options.compression, - mutable_cf_options.compression_opts, - mutable_cf_options.default_write_temperature, - 0 /* max_subcompactions */, {}, - /* earliest_snapshot */ std::nullopt, - /* snapshot_checker */ nullptr, /* is manual */ false, - /* trim_ts */ "", vstorage->CompactionScore(0), - /* is deletion compaction */ false, - /* l0_files_might_overlap */ true, - CompactionReason::kFIFOReduceNumFiles); - return c; - } - } + // Compute effective size and limit for comparison. + uint64_t effective_size, effective_max; + GetEffectiveSizeAndLimit(fifo_opts, total_size, + vstorage->GetBlobStats().total_file_size, + &effective_size, &effective_max); - ROCKS_LOG_BUFFER( - log_buffer, - "[%s] FIFO compaction: nothing to do. Total size %" PRIu64 - ", max size %" PRIu64 "\n", - cf_name.c_str(), total_size, - mutable_cf_options.compaction_options_fifo.max_table_files_size); + if (last_level == 0 && effective_size <= effective_max) { return nullptr; } @@ -232,11 +247,29 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( inputs[0].level = last_level; if (last_level == 0) { + // When using blob-aware sizing, use proportional estimation (same + // principle as EstimateTotalDataForSST): each SST "owns" + // effective_size / num_files of total data. This is an approximation + // — individual SSTs may reference different amounts of blob data, + // but uniform distribution is a reasonable estimate for FIFO dropping. + uint64_t remaining_size = effective_size; + const uint64_t num_files = last_level_files.size(); + // Proportional estimate of data per file (SST + blob). + // Use max(1) to prevent stalling when effective_size < num_files. + const uint64_t data_per_file = + (fifo_opts.max_data_files_size > 0 && num_files > 0) + ? std::max(effective_size / num_files, uint64_t{1}) + : 0; + // In L0, right-most files are the oldest files. for (auto ritr = last_level_files.rbegin(); ritr != last_level_files.rend(); ++ritr) { auto f = *ritr; - total_size -= f->fd.file_size; + if (fifo_opts.max_data_files_size > 0) { + remaining_size -= std::min(remaining_size, data_per_file); + } else { + remaining_size -= std::min(remaining_size, f->fd.file_size); + } inputs[0].files.push_back(f); char tmp_fsize[16]; AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize)); @@ -244,13 +277,11 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( "[%s] FIFO compaction: picking file %" PRIu64 " with size %s for deletion", cf_name.c_str(), f->fd.GetNumber(), tmp_fsize); - if (total_size <= - mutable_cf_options.compaction_options_fifo.max_table_files_size) { + if (remaining_size <= effective_max) { break; } } - } else if (total_size > - mutable_cf_options.compaction_options_fifo.max_table_files_size) { + } else if (effective_size > effective_max) { // If the last level is non-L0, we actually don't know which file is // logically the oldest since the file creation time only represents // when this file was compacted to this level, which is independent @@ -260,31 +291,36 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( // file with the smallest key will be deleted first. This design decision // better serves a major type of FIFO use cases where smaller keys are // associated with older data. + const uint64_t num_files = last_level_files.size(); + // Proportional estimate of data per file (SST + blob), same as L0 path. + const uint64_t data_per_file = + (fifo_opts.max_data_files_size > 0 && num_files > 0) + ? std::max(effective_size / num_files, uint64_t{1}) + : 0; for (const auto& f : last_level_files) { - total_size -= f->fd.file_size; + if (f->being_compacted) { + continue; + } + if (fifo_opts.max_data_files_size > 0) { + effective_size -= std::min(effective_size, data_per_file); + } else { + effective_size -= std::min(effective_size, f->fd.file_size); + } inputs[0].files.push_back(f); char tmp_fsize[16]; AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize)); - ROCKS_LOG_BUFFER( - log_buffer, - "[%s] FIFO compaction: picking file %" PRIu64 - " with size %s for deletion under total size %" PRIu64 - " vs max table files size %" PRIu64, - cf_name.c_str(), f->fd.GetNumber(), tmp_fsize, total_size, - mutable_cf_options.compaction_options_fifo.max_table_files_size); - - if (total_size <= - mutable_cf_options.compaction_options_fifo.max_table_files_size) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with size %s for deletion under total size %" PRIu64 + " vs max size %" PRIu64, + cf_name.c_str(), f->fd.GetNumber(), tmp_fsize, + effective_size, effective_max); + + if (effective_size <= effective_max) { break; } } } else { - ROCKS_LOG_BUFFER( - log_buffer, - "[%s] FIFO compaction: nothing to do. Total size %" PRIu64 - ", max size %" PRIu64 "\n", - cf_name.c_str(), total_size, - mutable_cf_options.compaction_options_fifo.max_table_files_size); return nullptr; } @@ -294,14 +330,11 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( /* target_file_size */ 0, /* max_compaction_bytes */ 0, /* output_path_id */ 0, kNoCompression, - mutable_cf_options.compression_opts, - mutable_cf_options.default_write_temperature, + mutable_cf_options.compression_opts, Temperature::kUnknown, /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt, - /* snapshot_checker */ nullptr, - /* is manual */ false, + /* snapshot_checker */ nullptr, CompactionReason::kFIFOMaxSize, /* trim_ts */ "", vstorage->CompactionScore(0), - /* is deletion compaction */ true, - /* l0_files_might_overlap */ true, CompactionReason::kFIFOMaxSize); + /* l0_files_might_overlap */ true); return c; } @@ -392,12 +425,14 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction( assert(compaction_target_temp == Temperature::kLastTemperature); compaction_target_temp = cur_target_temp; inputs[0].files.push_back(cur_file); - ROCKS_LOG_BUFFER( - log_buffer, - "[%s] FIFO compaction: picking file %" PRIu64 - " with estimated newest key time %" PRIu64 " for temperature %s.", - cf_name.c_str(), cur_file->fd.GetNumber(), est_newest_key_time, - temperature_to_string[cur_target_temp].c_str()); + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with estimated newest key time %" PRIu64 + " and temperature %s for temperature %s.", + cf_name.c_str(), cur_file->fd.GetNumber(), + est_newest_key_time, + temperature_to_string[cur_file->temperature].c_str(), + temperature_to_string[cur_target_temp].c_str()); break; } } @@ -416,19 +451,268 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction( mutable_cf_options.compression, mutable_cf_options.compression_opts, compaction_target_temp, /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt, - /* snapshot_checker */ nullptr, - /* is manual */ false, /* trim_ts */ "", vstorage->CompactionScore(0), - /* is deletion compaction */ false, /* l0_files_might_overlap */ true, - CompactionReason::kChangeTemperature); + /* snapshot_checker */ nullptr, CompactionReason::kChangeTemperature, + /* trim_ts */ "", vstorage->CompactionScore(0), + /* l0_files_might_overlap */ true); return c; } +Compaction* FIFOCompactionPicker::PickIntraL0Compaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer) { + const auto& fifo_opts = mutable_cf_options.compaction_options_fifo; + + if (!fifo_opts.allow_compaction) { + return nullptr; + } + + const std::vector& level0_files = vstorage->LevelFiles(0); + if (level0_files.empty()) { + return nullptr; + } + + if (fifo_opts.use_kv_ratio_compaction) { + return PickRatioBasedIntraL0Compaction( + cf_name, mutable_cf_options, mutable_db_options, vstorage, log_buffer); + } + + // Old intra-L0 path: merge small files using PickCostBasedIntraL0Compaction. + // Minimum files to compact follows level0_file_num_compaction_trigger. + // Try to prevent same files from being compacted multiple times, which + // could produce large files that may never TTL-expire. Achieve this by + // disallowing compactions with files larger than memtable (inflate its + // size by 10% to account for uncompressed L0 files that may have size + // slightly greater than memtable size limit). + + CompactionInputFiles comp_inputs; + size_t max_compact_bytes_per_del_file = + static_cast(MultiplyCheckOverflow( + static_cast(mutable_cf_options.write_buffer_size), 1.1)); + if (PickCostBasedIntraL0Compaction( + level0_files, + mutable_cf_options + .level0_file_num_compaction_trigger /* min_files_to_compact */, + max_compact_bytes_per_del_file, + mutable_cf_options.max_compaction_bytes, &comp_inputs)) { + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */, + 0 /* max compaction bytes, not applicable */, 0 /* output path ID */, + mutable_cf_options.compression, mutable_cf_options.compression_opts, + Temperature::kUnknown, 0 /* max_subcompactions */, {}, + /* earliest_snapshot */ std::nullopt, + /* snapshot_checker */ nullptr, CompactionReason::kFIFOReduceNumFiles, + /* trim_ts */ "", vstorage->CompactionScore(0), + /* l0_files_might_overlap */ true); + return c; + } + + return nullptr; +} + +Compaction* FIFOCompactionPicker::PickRatioBasedIntraL0Compaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer) { + const auto& fifo_opts = mutable_cf_options.compaction_options_fifo; + assert(fifo_opts.use_kv_ratio_compaction); + assert(fifo_opts.max_data_files_size > 0); + + // During migration from level/universal compaction to FIFO, non-L0 levels + // may still contain files. The ratio-based algorithm only operates on L0, + // so skip it until PickSizeCompaction has drained all non-L0 levels. + // Once levels collapse to L0-only, this algorithm will kick in. + for (int level = 1; level < vstorage->num_levels(); ++level) { + if (!vstorage->LevelFiles(level).empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO kv-ratio compaction: skipping — non-L0 " + "level %d still has %" ROCKSDB_PRIszt + " files (migration in progress)", + cf_name.c_str(), level, + vstorage->LevelFiles(level).size()); + return nullptr; + } + } + + if (!level0_compactions_in_progress_.empty()) { + return nullptr; + } + + const std::vector& level0_files = vstorage->LevelFiles(0); + if (mutable_cf_options.level0_file_num_compaction_trigger <= 1) { + // trigger <= 0 is invalid; trigger == 1 means compact after every flush, + // which doesn't make sense for tiered merging (the tier boundary loop + // divides by trigger, so trigger == 1 would cause an infinite loop). + return nullptr; + } + const size_t trigger = static_cast( + mutable_cf_options.level0_file_num_compaction_trigger); + if (level0_files.size() < trigger) { + return nullptr; + } + + // Determine the target compacted file size. + // + // When max_compaction_bytes > 0 (explicitly set by user), use it directly + // as the target. This allows users to override the auto-calculated value. + // + // When max_compaction_bytes == 0 (default), auto-calculate from the data + // capacity and observed SST/blob ratio: + // target = max_data_files_size * sst_ratio / trigger + // + // This is recomputed on every PickCompaction call. The computation is + // trivial (sum file sizes + arithmetic) and PickCompaction is only called + // once per flush or compaction completion, so no caching is needed. + uint64_t target = 0; + if (mutable_cf_options.max_compaction_bytes > 0) { + // User explicitly set max_compaction_bytes — use it as target + target = mutable_cf_options.max_compaction_bytes; + } else { + // Auto-calculate from capacity and observed SST/blob ratio + uint64_t total_sst = GetTotalFilesSize(level0_files); + uint64_t total_blob = vstorage->GetBlobStats().total_file_size; + uint64_t total_data = total_sst + total_blob; + + if (total_data == 0 || total_sst == 0) { + return nullptr; + } + + // Compute sst_ratio (inverse of EstimateTotalDataForSST's proportion): + // when no blob files exist, sst_ratio is 1.0 and the target becomes + // max_data_files_size / trigger, which is large. The algorithm will + // naturally not find small enough files to compact. + double sst_ratio = + (total_blob > 0) ? static_cast(total_sst) / total_data : 1.0; + + uint64_t total_sst_at_cap = + static_cast(fifo_opts.max_data_files_size * sst_ratio); + target = total_sst_at_cap / trigger; + + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO ratio-based compaction: sst_ratio=%.4f, " + "target_file_size=%" PRIu64, + cf_name.c_str(), sst_ratio, target); + } + if (target == 0) { + return nullptr; + } + + // Tiered size-based file selection. + // + // Tier boundaries form a geometric sequence descending from target: + // ..., target/trigger^2, target/trigger, target + // For each boundary (smallest first), find contiguous L0 files with + // size < boundary. If their accumulated bytes >= boundary, merge them. + // The output (~boundary bytes) advances to the next tier. Files that + // reach target are "graduated" and never compacted again. + // + // Trade-off: write amplification vs L0 file count. + // + // Write amp: O(log(target/flush) / log(trigger)) per byte, instead of + // O(target / (trigger * flush)) from flat merging. Each byte is + // rewritten once per tier crossing. + // + // L0 file count: trigger + k * (trigger - 1) at steady state, where + // k = ceil(log(target/flush) / log(trigger)). This is higher than + // the original trigger target because intermediate tier files + // accumulate while waiting for the next tier merge. The trade-off + // is explicit: more L0 files in exchange for logarithmic (instead + // of linear) write amplification. + + // Build tier boundaries from smallest to largest. + // Stop at 10KB minimum — SST files of most workloads are larger than + // this, so lower boundaries would only waste CPU scanning L0 files. + // Files smaller than the lowest boundary simply merge at that boundary. + static constexpr uint64_t kMinTierBoundary = 10 * 1024; // 10KB + std::vector boundaries; + for (uint64_t b = target; b >= kMinTierBoundary; b /= trigger) { + boundaries.push_back(b); + } + if (boundaries.empty()) { + // target itself is below kMinTierBoundary — use target as the + // sole boundary so we can still compact at the target size. + boundaries.push_back(target); + } + std::reverse(boundaries.begin(), boundaries.end()); + + // For each tier boundary (smallest first), scan L0 for mergeable batches. + // L0 files are stored newest-first; oldest is at the end. + for (const uint64_t boundary : boundaries) { + for (size_t scan = level0_files.size(); scan > 0;) { + // Skip files >= boundary (they belong to higher tiers) or in-progress + if (level0_files[scan - 1]->fd.file_size >= boundary || + level0_files[scan - 1]->being_compacted) { + --scan; + continue; + } + + // Found a file < boundary — collect contiguous batch + std::vector batch; + uint64_t accumulated = 0; + size_t pos = scan; + while (pos > 0 && level0_files[pos - 1]->fd.file_size < boundary && + !level0_files[pos - 1]->being_compacted) { + // Don't let output exceed 2x boundary (prevent tier-skipping) + if (accumulated >= boundary && + accumulated + level0_files[pos - 1]->fd.file_size > boundary * 2) { + break; + } + batch.push_back(level0_files[pos - 1]); + accumulated += level0_files[pos - 1]->fd.file_size; + --pos; + } + + // Viable: >= 2 files and accumulated >= boundary + if (batch.size() >= 2 && accumulated >= boundary) { + CompactionInputFiles comp_inputs; + comp_inputs.level = 0; + comp_inputs.files = std::move(batch); + + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO kv-ratio compaction: picking %" ROCKSDB_PRIszt + " files (%" PRIu64 " bytes) at tier boundary %" PRIu64 + " for intra-L0 compaction, target=%" PRIu64, + cf_name.c_str(), comp_inputs.files.size(), accumulated, boundary, + target); + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + {comp_inputs}, 0, boundary /* output file size limit */, + 0 /* max compaction bytes, not applicable */, + 0 /* output path ID */, mutable_cf_options.compression, + mutable_cf_options.compression_opts, Temperature::kUnknown, + 0 /* max_subcompactions */, {}, + /* earliest_snapshot */ std::nullopt, + /* snapshot_checker */ nullptr, + CompactionReason::kFIFOReduceNumFiles, + /* trim_ts */ "", vstorage->CompactionScore(0), + /* l0_files_might_overlap */ true); + return c; + } + + // This batch wasn't enough — advance past it + scan = pos; + } + } + + return nullptr; +} + +// The full_history_ts_low parameter is used to control bottommost file marking +// for compaction when user-defined timestamps (UDT) are enabled. + +// TODO leverage full_history_ts_low for FIFO compaction, by trigggerring +// compaction early for data that has already expired to achieve the goal of TTL +// enforced compliance. Compaction* FIFOCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, const std::vector& /* existing_snapshots */, const SnapshotChecker* /* snapshot_checker */, VersionStorageInfo* vstorage, - LogBuffer* log_buffer) { + LogBuffer* log_buffer, const std::string& /* full_history_ts_low */, + bool /* require_max_output_level*/) { Compaction* c = nullptr; if (mutable_cf_options.ttl > 0) { c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options, @@ -438,22 +722,35 @@ Compaction* FIFOCompactionPicker::PickCompaction( c = PickSizeCompaction(cf_name, mutable_cf_options, mutable_db_options, vstorage, log_buffer); } + // Intra-L0 compaction merges small files to reduce file count. + // It runs after size-based dropping: if PickSizeCompaction dropped files, + // it returned non-null and we skip this. Otherwise, we try to reduce + // L0 file count by merging small files together. + if (c == nullptr) { + c = PickIntraL0Compaction(cf_name, mutable_cf_options, mutable_db_options, + vstorage, log_buffer); + } if (c == nullptr) { c = PickTemperatureChangeCompaction( cf_name, mutable_cf_options, mutable_db_options, vstorage, log_buffer); } + if (c == nullptr) { + ROCKS_LOG_BUFFER(log_buffer, "[%s] FIFO compaction: no compaction picked", + cf_name.c_str()); + } RegisterCompaction(c); return c; } -Compaction* FIFOCompactionPicker::CompactRange( +Compaction* FIFOCompactionPicker::PickCompactionForCompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, int input_level, int output_level, const CompactRangeOptions& /*compact_range_options*/, const InternalKey* /*begin*/, const InternalKey* /*end*/, InternalKey** compaction_end, bool* /*manual_conflict*/, - uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/) { + uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/, + const std::string& full_history_ts_low) { #ifdef NDEBUG (void)input_level; (void)output_level; @@ -462,10 +759,10 @@ Compaction* FIFOCompactionPicker::CompactRange( assert(output_level == 0); *compaction_end = nullptr; LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger); - Compaction* c = - PickCompaction(cf_name, mutable_cf_options, mutable_db_options, - /*existing_snapshots*/ {}, /*snapshot_checker*/ nullptr, - vstorage, &log_buffer); + Compaction* c = PickCompaction( + cf_name, mutable_cf_options, mutable_db_options, + /*existing_snapshots*/ {}, /*snapshot_checker*/ nullptr, vstorage, + &log_buffer, full_history_ts_low, /* require_max_output_level */ false); log_buffer.FlushBufferToLog(); return c; } diff --git a/db/compaction/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h index 4dd1053e127b..2c1cd21321b9 100644 --- a/db/compaction/compaction_picker_fifo.h +++ b/db/compaction/compaction_picker_fifo.h @@ -23,18 +23,19 @@ class FIFOCompactionPicker : public CompactionPicker { const MutableDBOptions& mutable_db_options, const std::vector& /* existing_snapshots */, const SnapshotChecker* /* snapshot_checker */, - VersionStorageInfo* version, LogBuffer* log_buffer) override; - - Compaction* CompactRange(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, - VersionStorageInfo* vstorage, int input_level, - int output_level, - const CompactRangeOptions& compact_range_options, - const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end, bool* manual_conflict, - uint64_t max_file_num_to_ignore, - const std::string& trim_ts) override; + VersionStorageInfo* version, LogBuffer* log_buffer, + const std::string& /* full_history_ts_low */, + bool /* require_max_output_level*/ = false) override; + + Compaction* PickCompactionForCompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, + const CompactRangeOptions& compact_range_options, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end, bool* manual_conflict, + uint64_t max_file_num_to_ignore, const std::string& trim_ts, + const std::string& full_history_ts_low) override; // The maximum allowed output level. Always returns 0. int MaxOutputLevel() const override { return 0; } @@ -54,6 +55,28 @@ class FIFOCompactionPicker : public CompactionPicker { VersionStorageInfo* version, LogBuffer* log_buffer); + // Intra-L0 compaction: merges small L0 files to reduce file count. + // Dispatches between two strategies based on configuration: + // - use_kv_ratio_compaction = true: PickRatioBasedIntraL0Compaction + // (BlobDB-optimized) + // - use_kv_ratio_compaction = false: PickCostBasedIntraL0Compaction + // (original) + // Only active when allow_compaction = true. + Compaction* PickIntraL0Compaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer); + + // Capacity-derived intra-L0 compaction for BlobDB workloads. + // Uses the observed SST/blob ratio to compute a target file size, + // producing uniform files for predictable FIFO trimming. + // Called from PickIntraL0Compaction when use_kv_ratio_compaction = true. + Compaction* PickRatioBasedIntraL0Compaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer); + // Will pick one file to compact at a time, starting from the oldest file. Compaction* PickTemperatureChangeCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc index 612c1e5af21a..ade42ce5e3e8 100644 --- a/db/compaction/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -61,14 +61,16 @@ class LevelCompactionBuilder { LogBuffer* log_buffer, const MutableCFOptions& mutable_cf_options, const ImmutableOptions& ioptions, - const MutableDBOptions& mutable_db_options) + const MutableDBOptions& mutable_db_options, + const std::string& full_history_ts_low) : cf_name_(cf_name), vstorage_(vstorage), compaction_picker_(compaction_picker), log_buffer_(log_buffer), mutable_cf_options_(mutable_cf_options), ioptions_(ioptions), - mutable_db_options_(mutable_db_options) {} + mutable_db_options_(mutable_db_options), + full_history_ts_low_(full_history_ts_low) {} // Pick and return a compaction. Compaction* PickCompaction(); @@ -145,7 +147,6 @@ class LevelCompactionBuilder { int parent_index_ = -1; int base_index_ = -1; double start_level_score_ = 0; - bool is_manual_ = false; bool is_l0_trivial_move_ = false; CompactionInputFiles start_level_inputs_; std::vector compaction_inputs_; @@ -156,6 +157,7 @@ class LevelCompactionBuilder { const MutableCFOptions& mutable_cf_options_; const ImmutableOptions& ioptions_; const MutableDBOptions& mutable_db_options_; + const std::string& full_history_ts_low_; // Pick a path ID to place a newly generated file, with its level static uint32_t GetPathId(const ImmutableCFOptions& ioptions, const MutableCFOptions& mutable_cf_options, @@ -414,9 +416,9 @@ void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() { &tmp_start_level_inputs) || compaction_picker_->FilesRangeOverlapWithCompaction( {tmp_start_level_inputs}, output_level_, - Compaction::EvaluatePenultimateLevel(vstorage_, mutable_cf_options_, - ioptions_, start_level_, - output_level_))) { + Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_, + ioptions_, start_level_, + output_level_))) { // Constraint 1a tmp_start_level_inputs.clear(); return; @@ -490,9 +492,9 @@ bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() { // We need to disallow this from happening. if (compaction_picker_->FilesRangeOverlapWithCompaction( compaction_inputs_, output_level_, - Compaction::EvaluatePenultimateLevel(vstorage_, mutable_cf_options_, - ioptions_, start_level_, - output_level_))) { + Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_, + ioptions_, start_level_, + output_level_))) { // This compaction output could potentially conflict with the output // of a currently running compaction, we cannot run it. return false; @@ -558,12 +560,11 @@ Compaction* LevelCompactionBuilder::GetCompaction() { GetCompressionType(vstorage_, mutable_cf_options_, output_level_, vstorage_->base_level()), GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_), - mutable_cf_options_.default_write_temperature, + Temperature::kUnknown, /* max_subcompactions */ 0, std::move(grandparents_), /* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr, - is_manual_, - /* trim_ts */ "", start_level_score_, false /* deletion_compaction */, - l0_files_might_overlap, compaction_reason_); + compaction_reason_, + /* trim_ts */ "", start_level_score_, l0_files_might_overlap); // If it's level 0 compaction, make sure we don't execute any other level 0 // compactions in parallel @@ -573,7 +574,8 @@ Compaction* LevelCompactionBuilder::GetCompaction() { // takes running compactions into account (by skipping files that are already // being compacted). Since we just changed compaction score, we recalculate it // here - vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_, + full_history_ts_low_); return c; } @@ -846,9 +848,9 @@ bool LevelCompactionBuilder::PickFileToCompact() { &start_level_inputs_) || compaction_picker_->FilesRangeOverlapWithCompaction( {start_level_inputs_}, output_level_, - Compaction::EvaluatePenultimateLevel(vstorage_, mutable_cf_options_, - ioptions_, start_level_, - output_level_))) { + Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_, + ioptions_, start_level_, + output_level_))) { // A locked (pending compaction) input-level file was pulled in due to // user-key overlap. start_level_inputs_.clear(); @@ -912,10 +914,10 @@ bool LevelCompactionBuilder::PickIntraL0Compaction() { // resort to L0->L0 compaction yet. return false; } - return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction, - std::numeric_limits::max(), - mutable_cf_options_.max_compaction_bytes, - &start_level_inputs_); + return PickCostBasedIntraL0Compaction( + level_files, kMinFilesForIntraL0Compaction, + std::numeric_limits::max(), + mutable_cf_options_.max_compaction_bytes, &start_level_inputs_); } bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() { @@ -978,10 +980,11 @@ Compaction* LevelCompactionPicker::PickCompaction( const MutableDBOptions& mutable_db_options, const std::vector& /*existing_snapshots */, const SnapshotChecker* /*snapshot_checker*/, VersionStorageInfo* vstorage, - LogBuffer* log_buffer) { + LogBuffer* log_buffer, const std::string& full_history_ts_low, + bool /* require_max_output_level*/) { LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer, mutable_cf_options, ioptions_, - mutable_db_options); + mutable_db_options, full_history_ts_low); return builder.PickCompaction(); } } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_picker_level.h b/db/compaction/compaction_picker_level.h index 9cb41dfb64f8..e86c821aa309 100644 --- a/db/compaction/compaction_picker_level.h +++ b/db/compaction/compaction_picker_level.h @@ -25,7 +25,9 @@ class LevelCompactionPicker : public CompactionPicker { const MutableDBOptions& mutable_db_options, const std::vector& /* existing_snapshots */, const SnapshotChecker* /* snapshot_checker */, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) override; + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + const std::string& full_history_ts_low, + bool /*require_max_output_level*/ = false) override; bool NeedsCompaction(const VersionStorageInfo* vstorage) const override; }; diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index f48195e29a0b..4dfa327ae162 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -7,6 +7,8 @@ #include #include +#include "db/blob/blob_file_meta.h" +#include "db/column_family.h" #include "db/compaction/compaction.h" #include "db/compaction/compaction_picker_fifo.h" #include "db/compaction/compaction_picker_level.h" @@ -17,6 +19,7 @@ #include "table/unique_id_impl.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -160,11 +163,19 @@ class CompactionPickerTestBase : public testing::Test { kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, - true /* user_defined_timestamps_persisted */); + true /* user_defined_timestamps_persisted */, "" /* min timestamp */, + "" /* max timestamp */); f->compensated_file_size = (compensated_file_size != 0) ? compensated_file_size : file_size; // oldest_ancester_time is only used if newest_key_time is not available f->oldest_ancester_time = oldest_ancestor_time; + // Set min/max timestamps for UDT support + if (!ts_of_smallest.empty()) { + f->min_timestamp = ts_of_smallest.ToString(); + } + if (!ts_of_largest.empty()) { + f->max_timestamp = ts_of_largest.ToString(); + } TableProperties tp; tp.newest_key_time = newest_key_time; f->fd.table_reader = new mock::MockTableReader(mock::KVVector{}, tp); @@ -195,6 +206,11 @@ class CompactionPickerTestBase : public testing::Test { } void UpdateVersionStorageInfo() { + UpdateVersionStorageInfoWithTsLow(/*full_history_ts_low=*/""); + } + + void UpdateVersionStorageInfoWithTsLow( + const std::string& full_history_ts_low) { if (temp_vstorage_) { VersionBuilder builder(FileOptions(), &ioptions_, nullptr, vstorage_.get(), nullptr); @@ -202,10 +218,51 @@ class CompactionPickerTestBase : public testing::Test { vstorage_ = std::move(temp_vstorage_); } vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_); - vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_, + full_history_ts_low); vstorage_->SetFinalized(); } + void AddBlobFile(uint64_t blob_file_number, uint64_t total_blob_bytes, + BlobFileMetaData::LinkedSsts linked_ssts = {}) { + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, /*total_blob_count=*/1, total_blob_bytes, + /*checksum_method=*/"", /*checksum_value=*/""); + auto meta = + BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts), + /*garbage_blob_count=*/0, + /*garbage_blob_bytes=*/0); + vstorage_->AddBlobFile(std::move(meta)); + } + + // Helper to set up FIFO ratio-based compaction options and version storage. + // Call before Add()/AddBlobFile(), then create FIFOCompactionPicker after. + void SetupFIFORatioBased(uint64_t max_table_files_size, + uint64_t max_data_files_size, int trigger, + bool allow_compaction = true, + bool use_kv_ratio = true, int num_levels = 1) { + ioptions_.compaction_style = kCompactionStyleFIFO; + NewVersionStorage(num_levels, kCompactionStyleFIFO); + mutable_cf_options_.compaction_options_fifo.max_table_files_size = + max_table_files_size; + mutable_cf_options_.compaction_options_fifo.max_data_files_size = + max_data_files_size; + mutable_cf_options_.compaction_options_fifo.allow_compaction = + allow_compaction; + mutable_cf_options_.compaction_options_fifo.use_kv_ratio_compaction = + use_kv_ratio; + mutable_cf_options_.level0_file_num_compaction_trigger = trigger; + } + + // Helper to finalize version storage and pick a FIFO compaction. + std::unique_ptr PickFIFOCompaction(FIFOCompactionPicker& picker) { + UpdateVersionStorageInfo(); + return std::unique_ptr(picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /*snapshot_checker=*/nullptr, + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); + } + private: Options CreateOptions(const Comparator* ucmp) const { Options opts; @@ -242,6 +299,60 @@ class CompactionPickerU64TsTest : public CompactionPickerTestBase { : CompactionPickerTestBase(test::BytewiseComparatorWithU64TsWrapper()) {} ~CompactionPickerU64TsTest() override = default; + + protected: + // Helper to create a U64 timestamp string from a uint64_t value + static std::string MakeU64Timestamp(uint64_t ts) { + std::string result; + PutFixed64(&result, ts); + return result; + } + + // Helper to add a bottommost file with timestamps and setup version storage + // for testing bottommost file marking behavior + void SetupBottommostFileWithTimestamps(uint64_t min_ts, uint64_t max_ts, + uint64_t full_history_ts_low_val, + SequenceNumber oldest_snapshot_seqnum, + std::string* out_full_history_ts_low) { + std::string ts_small = MakeU64Timestamp(min_ts); + std::string ts_large = MakeU64Timestamp(max_ts); + + Add(5, 1U, "100", "200", /*file_size=*/1000, /*path_id=*/0, + /*smallest_seq=*/10, /*largest_seq=*/40, + /*compensated_file_size=*/1000, + /*marked_for_compact=*/false, Temperature::kUnknown, + kUnknownOldestAncesterTime, kUnknownNewestKeyTime, ts_small, ts_large); + + std::string full_history_ts_low = MakeU64Timestamp(full_history_ts_low_val); + + UpdateVersionStorageInfoWithTsLow(full_history_ts_low); + + vstorage_->UpdateOldestSnapshot(oldest_snapshot_seqnum, + /*allow_ingest_behind=*/false, + /*ucmp=*/ucmp_, full_history_ts_low); + + if (out_full_history_ts_low) { + *out_full_history_ts_low = full_history_ts_low; + } + } + + // Helper to add L0 files with timestamps for compaction trigger tests + void AddL0FilesWithTimestamps(uint64_t ts1_val, uint64_t ts2_val, + uint64_t file_size = 1U) { + std::string ts1 = MakeU64Timestamp(ts1_val); + std::string ts2 = MakeU64Timestamp(ts2_val); + + Add(0, 1U, "100", "200", file_size, /*path_id=*/0, + /*smallest_seq=*/100, /*largest_seq=*/100, + /*compensated_file_size=*/file_size, + /*marked_for_compact=*/false, Temperature::kUnknown, + kUnknownOldestAncesterTime, kUnknownNewestKeyTime, ts1, ts2); + Add(0, 2U, "150", "250", file_size, /*path_id=*/0, + /*smallest_seq=*/200, /*largest_seq=*/200, + /*compensated_file_size=*/file_size, + /*marked_for_compact=*/false, Temperature::kUnknown, + kUnknownOldestAncesterTime, kUnknownNewestKeyTime, ts1, ts2); + } }; TEST_F(CompactionPickerTest, Empty) { @@ -250,7 +361,7 @@ TEST_F(CompactionPickerTest, Empty) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() == nullptr); } @@ -263,7 +374,7 @@ TEST_F(CompactionPickerTest, Single) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() == nullptr); } @@ -278,7 +389,7 @@ TEST_F(CompactionPickerTest, Level0Trigger) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -293,7 +404,7 @@ TEST_F(CompactionPickerTest, Level1Trigger) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); @@ -313,7 +424,7 @@ TEST_F(CompactionPickerTest, Level1Trigger2) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(2U, compaction->num_input_files(1)); @@ -346,7 +457,7 @@ TEST_F(CompactionPickerTest, LevelMaxScore) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); @@ -395,7 +506,7 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -421,7 +532,7 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic2) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -448,7 +559,7 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic3) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -479,7 +590,7 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic4) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -513,7 +624,7 @@ TEST_F(CompactionPickerTest, LevelTriggerDynamic4) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); @@ -544,41 +655,48 @@ TEST_F(CompactionPickerTest, NeedsCompactionUniversal) { } TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) { - const uint64_t kFileSize = 100000; - NewVersionStorage(3 /* num_levels */, kCompactionStyleUniversal); - ioptions_.allow_ingest_behind = true; - ioptions_.num_levels = 3; - UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); - UpdateVersionStorageInfo(); - // must return false when there's no files. - ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), - false); + for (bool cf_option : {false, true}) { + SCOPED_TRACE("cf_option = " + std::to_string(cf_option)); + const uint64_t kFileSize = 100000; + NewVersionStorage(3 /* num_levels */, kCompactionStyleUniversal); + if (cf_option) { + ioptions_.cf_allow_ingest_behind = true; + } else { + ioptions_.allow_ingest_behind = true; + } + ioptions_.num_levels = 3; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + UpdateVersionStorageInfo(); + // must return false when there's no files. + ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), + false); - NewVersionStorage(3, kCompactionStyleUniversal); + NewVersionStorage(3, kCompactionStyleUniversal); - Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); - Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); - Add(0, 4U, "260", "300", kFileSize, 0, 260, 300); - Add(1, 5U, "100", "151", kFileSize, 0, 200, 251); - Add(1, 3U, "301", "350", kFileSize, 0, 101, 150); - Add(2, 6U, "120", "200", kFileSize, 0, 20, 100); + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", kFileSize, 0, 260, 300); + Add(1, 5U, "100", "151", kFileSize, 0, 200, 251); + Add(1, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(2, 6U, "120", "200", kFileSize, 0, 20, 100); - UpdateVersionStorageInfo(); + UpdateVersionStorageInfo(); - std::unique_ptr compaction( - universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, - /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); - // output level should be the one above the bottom-most - ASSERT_EQ(1, compaction->output_level()); + // output level should be the one above the bottom-most + ASSERT_EQ(1, compaction->output_level()); - // input should not include the reserved level - const std::vector* inputs = compaction->inputs(); - for (const auto& compaction_input : *inputs) { - if (!compaction_input.empty()) { - ASSERT_LT(compaction_input.level, 2); + // input should not include the reserved level + const std::vector* inputs = compaction->inputs(); + for (const auto& compaction_input : *inputs) { + if (!compaction_input.empty()) { + ASSERT_LT(compaction_input.level, 2); + } } } } @@ -613,7 +731,7 @@ TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(!compaction->is_trivial_move()); } @@ -641,7 +759,7 @@ TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction->is_trivial_move()); } @@ -671,7 +789,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); ASSERT_EQ(4, compaction->output_level()); @@ -703,7 +821,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_FALSE(compaction); } @@ -731,7 +849,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_FALSE(compaction); } @@ -763,7 +881,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(!compaction || compaction->start_level() != compaction->output_level()); } @@ -785,7 +903,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); ASSERT_EQ(0, compaction->start_level()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -811,7 +929,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); ASSERT_EQ(4, compaction->start_level()); ASSERT_EQ(2U, compaction->num_input_files(0)); @@ -850,7 +968,7 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace1) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); ASSERT_EQ(4, compaction->output_level()); ASSERT_EQ(3, compaction->start_level()); @@ -893,7 +1011,7 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace2) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); ASSERT_EQ(4, compaction->output_level()); ASSERT_EQ(2, compaction->start_level()); @@ -936,7 +1054,7 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace3) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); ASSERT_EQ(4, compaction->output_level()); ASSERT_EQ(2, compaction->start_level()); @@ -985,7 +1103,7 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace4) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); ASSERT_EQ(4, compaction->output_level()); ASSERT_EQ(3, compaction->start_level()); @@ -1030,7 +1148,7 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace5) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); ASSERT_EQ(4, compaction->output_level()); ASSERT_EQ(3, compaction->start_level()); @@ -1083,7 +1201,7 @@ TEST_F(CompactionPickerTest, universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(compaction->compaction_reason(), CompactionReason::kUniversalSizeAmplification); @@ -1134,10 +1252,15 @@ TEST_F(CompactionPickerTest, FIFOToCold1) { fifo_options_.max_table_files_size = kMaxSize; fifo_options_.file_temperature_age_thresholds = { {Temperature::kCold, kColdThreshold}}; + fifo_options_.allow_trivial_copy_when_change_temperature = true; + fifo_options_.trivial_copy_buffer_size = 16 * 1024 * 1024; mutable_cf_options_.compaction_options_fifo = fifo_options_; mutable_cf_options_.level0_file_num_compaction_trigger = 100; mutable_cf_options_.max_compaction_bytes = kFileSize * 100; - FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + auto copiedIOptions = ioptions_; + copiedIOptions.compaction_style = kCompactionStyleFIFO; + FIFOCompactionPicker fifo_compaction_picker(copiedIOptions, &icmp_); int64_t current_time = 0; ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); @@ -1162,11 +1285,11 @@ TEST_F(CompactionPickerTest, FIFOToCold1) { fifo_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(compaction->compaction_reason(), CompactionReason::kChangeTemperature); - ASSERT_EQ(compaction->output_temperature(), Temperature::kCold); + ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kCold); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); } @@ -1186,7 +1309,10 @@ TEST_F(CompactionPickerTest, FIFOToColdMaxCompactionSize) { mutable_cf_options_.compaction_options_fifo = fifo_options_; mutable_cf_options_.level0_file_num_compaction_trigger = 100; mutable_cf_options_.max_compaction_bytes = kFileSize * 9; - FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + auto copiedIOptions = ioptions_; + copiedIOptions.compaction_style = kCompactionStyleFIFO; + FIFOCompactionPicker fifo_compaction_picker(copiedIOptions, &icmp_); int64_t current_time = 0; ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); @@ -1228,12 +1354,12 @@ TEST_F(CompactionPickerTest, FIFOToColdMaxCompactionSize) { fifo_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(compaction->compaction_reason(), CompactionReason::kChangeTemperature); // Compaction picker picks older files first and picks one file at a time. - ASSERT_EQ(compaction->output_temperature(), Temperature::kCold); + ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kCold); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); } @@ -1253,7 +1379,10 @@ TEST_F(CompactionPickerTest, FIFOToColdWithExistingCold) { mutable_cf_options_.compaction_options_fifo = fifo_options_; mutable_cf_options_.level0_file_num_compaction_trigger = 100; mutable_cf_options_.max_compaction_bytes = kFileSize * 100; - FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + auto copiedIOptions = ioptions_; + copiedIOptions.compaction_style = kCompactionStyleFIFO; + FIFOCompactionPicker fifo_compaction_picker(copiedIOptions, &icmp_); int64_t current_time = 0; ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); @@ -1293,12 +1422,12 @@ TEST_F(CompactionPickerTest, FIFOToColdWithExistingCold) { fifo_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(compaction->compaction_reason(), CompactionReason::kChangeTemperature); // Compaction picker picks older files first and picks one file at a time. - ASSERT_EQ(compaction->output_temperature(), Temperature::kCold); + ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kCold); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); } @@ -1318,7 +1447,10 @@ TEST_F(CompactionPickerTest, FIFOToColdWithHotBetweenCold) { mutable_cf_options_.compaction_options_fifo = fifo_options_; mutable_cf_options_.level0_file_num_compaction_trigger = 100; mutable_cf_options_.max_compaction_bytes = kFileSize * 100; - FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + auto copiedIOptions = ioptions_; + copiedIOptions.compaction_style = kCompactionStyleFIFO; + FIFOCompactionPicker fifo_compaction_picker(copiedIOptions, &icmp_); int64_t current_time = 0; ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); @@ -1358,11 +1490,11 @@ TEST_F(CompactionPickerTest, FIFOToColdWithHotBetweenCold) { fifo_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(compaction->compaction_reason(), CompactionReason::kChangeTemperature); - ASSERT_EQ(compaction->output_temperature(), Temperature::kCold); + ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kCold); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); } @@ -1385,7 +1517,10 @@ TEST_F(CompactionPickerTest, FIFOToHotAndWarm) { mutable_cf_options_.compaction_options_fifo = fifo_options_; mutable_cf_options_.level0_file_num_compaction_trigger = 100; mutable_cf_options_.max_compaction_bytes = kFileSize * 100; - FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + auto copiedIOptions = ioptions_; + copiedIOptions.compaction_style = kCompactionStyleFIFO; + FIFOCompactionPicker fifo_compaction_picker(copiedIOptions, &icmp_); int64_t current_time = 0; ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); @@ -1435,17 +1570,40 @@ TEST_F(CompactionPickerTest, FIFOToHotAndWarm) { fifo_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(compaction->compaction_reason(), CompactionReason::kChangeTemperature); // Compaction picker picks older files first and picks one file at a time. - ASSERT_EQ(compaction->output_temperature(), Temperature::kWarm); + ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kWarm); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); } } +TEST_F(CompactionPickerTest, CompactFilesOutputTemperature) { + NewVersionStorage(6, kCompactionStyleLevel); + auto file_number = 66U; + Add(0, file_number, "150", "200", 1000000000U); + UpdateVersionStorageInfo(); + + std::unordered_set input{file_number}; + std::vector input_files; + ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers( + &input_files, &input, vstorage_.get(), CompactionOptions())); + + auto compaction_options = CompactionOptions(); + compaction_options.output_temperature_override = Temperature::kCold; + + std::unique_ptr compaction( + level_compaction_picker.PickCompactionForCompactFiles( + compaction_options, input_files, 1, vstorage_.get(), + mutable_cf_options_, mutable_db_options_, /*output_path_id=*/0)); + + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kCold); +} + TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) { NewVersionStorage(6, kCompactionStyleLevel); ioptions_.compaction_pri = kMinOverlappingRatio; @@ -1469,7 +1627,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); // Pick file 8 because it overlaps with 0 files on level 3. @@ -1503,7 +1661,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); // Picking file 7 because overlapping ratio is the biggest. @@ -1532,7 +1690,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); // Picking file 8 because overlapping ratio is the biggest. @@ -1561,7 +1719,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); // Picking file 6 because overlapping ratio is the biggest. @@ -1598,7 +1756,7 @@ TEST_F(CompactionPickerTest, CompactionPriRoundRobin) { local_level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); // Since the max bytes for level 2 is 120M, picking one file to compact // makes the post-compaction level size less than 120M, there is exactly one @@ -1639,7 +1797,7 @@ TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin1) { local_level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); // The maximum compaction bytes is very large in this case so we can igore its @@ -1683,7 +1841,7 @@ TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin2) { local_level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); // The maximum compaction bytes is only 2500 bytes now. Even though we are @@ -1728,7 +1886,7 @@ TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin3) { local_level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); // Cannot pick more files since we reach the last file in level 2 @@ -1788,7 +1946,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlappingManyFiles) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); // Picking file 8 because overlapping ratio is the biggest. @@ -1817,7 +1975,7 @@ TEST_F(CompactionPickerTest, ParentIndexResetBug) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); } // This test checks ExpandWhileOverlapping() by having overlapping user keys @@ -1836,7 +1994,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(2U, compaction->num_input_files(0)); @@ -1857,7 +2015,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys2) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(2U, compaction->num_input_files(0)); @@ -1886,7 +2044,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys3) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(5U, compaction->num_input_files(0)); @@ -1918,7 +2076,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys4) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -1943,7 +2101,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys5) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() == nullptr); } @@ -1966,7 +2124,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys6) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -1988,7 +2146,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys7) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_GE(1U, compaction->num_input_files(0)); @@ -2018,7 +2176,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys8) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(3U, compaction->num_input_files(0)); @@ -2052,7 +2210,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys9) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(5U, compaction->num_input_files(0)); @@ -2094,7 +2252,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys10) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -2134,7 +2292,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys11) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -2242,7 +2400,7 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() == nullptr); } @@ -2274,7 +2432,7 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); } @@ -2309,7 +2467,7 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); } @@ -2611,7 +2769,7 @@ TEST_F(CompactionPickerTest, CompactionLimitWhenAddFileFromInputLevel) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(4U, compaction->num_input_files(0)); @@ -2647,7 +2805,7 @@ TEST_F(CompactionPickerTest, HitCompactionLimitWhenAddFileFromInputLevel) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -2672,13 +2830,14 @@ TEST_F(CompactionPickerTest, CompactRangeMaxCompactionBytes) { bool manual_conflict = false; InternalKey manual_end; InternalKey* manual_end_ptr = &manual_end; - std::unique_ptr compaction(level_compaction_picker.CompactRange( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - /*input_level=*/1, /*output_level=*/2, - /*compact_range_options*/ {}, /*begin=*/nullptr, /*end=*/nullptr, - &manual_end_ptr, &manual_conflict, - /*max_file_num_to_ignore=*/std::numeric_limits::max(), - /*trim_ts=*/"")); + std::unique_ptr compaction( + level_compaction_picker.PickCompactionForCompactRange( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + /*input_level=*/1, /*output_level=*/2, + /*compact_range_options*/ {}, /*begin=*/nullptr, /*end=*/nullptr, + &manual_end_ptr, &manual_conflict, + /*max_file_num_to_ignore=*/std::numeric_limits::max(), + /*trim_ts=*/"", /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(2, compaction->output_level()); @@ -2707,7 +2866,7 @@ TEST_F(CompactionPickerTest, IsTrivialMoveOn) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_TRUE(compaction->IsTrivialMove()); } @@ -2733,7 +2892,7 @@ TEST_F(CompactionPickerTest, L0TrivialMove1) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1, compaction->num_input_levels()); ASSERT_EQ(2, compaction->num_input_files(0)); @@ -2763,7 +2922,7 @@ TEST_F(CompactionPickerTest, L0TrivialMoveOneFile) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1, compaction->num_input_levels()); ASSERT_EQ(1, compaction->num_input_files(0)); @@ -2790,7 +2949,7 @@ TEST_F(CompactionPickerTest, L0TrivialMoveWholeL0) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1, compaction->num_input_levels()); ASSERT_EQ(4, compaction->num_input_files(0)); @@ -2819,7 +2978,7 @@ TEST_F(CompactionPickerTest, NonL0TrivialMoveExtendBothDirection) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1, compaction->num_input_levels()); ASSERT_EQ(3, compaction->num_input_files(0)); @@ -2850,7 +3009,7 @@ TEST_F(CompactionPickerTest, L0TrivialMoveToEmptyLevel) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1, compaction->num_input_levels()); ASSERT_EQ(1, compaction->num_input_files(0)); @@ -2879,7 +3038,7 @@ TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); // No trivial move, because partitioning is applied ASSERT_TRUE(!compaction->IsTrivialMove()); @@ -2903,7 +3062,7 @@ TEST_F(CompactionPickerTest, IsTrivialMoveOff) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_FALSE(compaction->IsTrivialMove()); } @@ -2933,7 +3092,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles1) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_TRUE(compaction->IsTrivialMove()); ASSERT_EQ(1, compaction->num_input_levels()); @@ -2968,7 +3127,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles2) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_TRUE(compaction->IsTrivialMove()); ASSERT_EQ(1, compaction->num_input_levels()); @@ -3002,7 +3161,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles3) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_TRUE(compaction->IsTrivialMove()); ASSERT_EQ(1, compaction->num_input_levels()); @@ -3029,7 +3188,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles4) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_TRUE(compaction->IsTrivialMove()); ASSERT_EQ(1, compaction->num_input_levels()); @@ -3060,7 +3219,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles5) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_TRUE(compaction->IsTrivialMove()); ASSERT_EQ(1, compaction->num_input_levels()); @@ -3095,7 +3254,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles6) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_TRUE(compaction->IsTrivialMove()); ASSERT_EQ(1, compaction->num_input_levels()); @@ -3131,7 +3290,7 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -3142,7 +3301,7 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) { compaction.reset(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -3153,7 +3312,7 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) { compaction.reset(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() == nullptr); ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */)); } @@ -3180,7 +3339,7 @@ TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(5U, compaction->num_input_files(0)); @@ -3212,7 +3371,7 @@ TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(4U, compaction->num_input_files(0)); @@ -3262,7 +3421,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); // Validate that its a compaction to reduce sorted runs @@ -3286,7 +3445,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_FALSE(compaction2); } @@ -3317,7 +3476,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); // Validate that its a delete triggered compaction @@ -3348,7 +3507,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_FALSE(compaction2); } @@ -3390,7 +3549,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionStartOutputOverlap) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); // Validate that its a delete triggered compaction @@ -3415,14 +3574,15 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionStartOutputOverlap) { ASSERT_EQ(1U, compaction->num_input_files(1)); } - vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_, + /*full_history_ts_low=*/""); // After recomputing the compaction score, only one marked file will remain random_index = 0; std::unique_ptr compaction2( universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_FALSE(compaction2); DeleteVersionStorage(); } @@ -3449,7 +3609,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0NoOverlap) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); // Validate that its a delete triggered compaction @@ -3487,7 +3647,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0WithOverlap) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); // Validate that its a delete triggered compaction @@ -3545,7 +3705,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); // Validate that its a delete triggered compaction @@ -3579,7 +3739,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction2); ASSERT_EQ(3U, compaction->num_input_files(0)); ASSERT_TRUE(file_map_[1].first->being_compacted); @@ -3610,11 +3770,12 @@ TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) { bool manual_conflict = false; InternalKey* manual_end = nullptr; std::unique_ptr compaction( - universal_compaction_picker.CompactRange( + universal_compaction_picker.PickCompactionForCompactRange( cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), ColumnFamilyData::kCompactAllLevels, 6, CompactRangeOptions(), nullptr, nullptr, &manual_end, &manual_conflict, - std::numeric_limits::max(), "")); + std::numeric_limits::max(), "", + /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); @@ -3659,7 +3820,7 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNonLastLevel) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); // Make sure it's a size amp compaction and includes all files ASSERT_EQ(compaction->compaction_reason(), @@ -3677,7 +3838,7 @@ TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) { const uint64_t kFileSize = 100000; const int kNumLevels = 7; const int kLastLevel = kNumLevels - 1; - const int kPenultimateLevel = kLastLevel - 1; + const int kProximalLevel = kLastLevel - 1; ioptions_.compaction_style = kCompactionStyleUniversal; mutable_cf_options_.preclude_last_level_data_seconds = 1000; @@ -3696,20 +3857,20 @@ TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); // Internally, size amp compaction is evaluated before size ratio compaction. // Here to make sure it's size ratio compaction instead of size amp ASSERT_EQ(compaction->compaction_reason(), CompactionReason::kUniversalSizeRatio); - ASSERT_EQ(compaction->output_level(), kPenultimateLevel - 1); + ASSERT_EQ(compaction->output_level(), kProximalLevel - 1); ASSERT_EQ(compaction->input_levels(0)->num_files, 2); ASSERT_EQ(compaction->input_levels(5)->num_files, 0); ASSERT_EQ(compaction->input_levels(6)->num_files, 0); } TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSuport) { - // Tiered compaction only support level_num > 2 (otherwise the penultimate + // Tiered compaction only support level_num > 2 (otherwise the proximal // level is going to be level 0, which may make thing more complicated), so // when there's only 2 level, still treating level 1 as the last level for // size amp compaction @@ -3737,7 +3898,7 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSuport) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); // size amp compaction is still triggered even preclude_last_level is set ASSERT_EQ(compaction->compaction_reason(), @@ -3753,7 +3914,7 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) { const uint64_t kFileSize = 100000; const int kNumLevels = 7; const int kLastLevel = kNumLevels - 1; - const int kPenultimateLevel = kLastLevel - 1; + const int kProximalLevel = kLastLevel - 1; ioptions_.compaction_style = kCompactionStyleUniversal; mutable_cf_options_.preclude_last_level_data_seconds = 1000; @@ -3772,13 +3933,13 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); // It's a Size Amp compaction, but doesn't include the last level file and - // output to the penultimate level. + // output to the proximal level. ASSERT_EQ(compaction->compaction_reason(), CompactionReason::kUniversalSizeAmplification); - ASSERT_EQ(compaction->output_level(), kPenultimateLevel); + ASSERT_EQ(compaction->output_level(), kProximalLevel); ASSERT_EQ(compaction->input_levels(0)->num_files, 2); ASSERT_EQ(compaction->input_levels(5)->num_files, 1); ASSERT_EQ(compaction->input_levels(6)->num_files, 0); @@ -3814,9 +3975,10 @@ TEST_F(CompactionPickerU64TsTest, Overlap) { std::vector input_files; ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers( &input_files, &input, vstorage_.get(), CompactionOptions())); - std::unique_ptr comp1(level_compaction_picker.CompactFiles( - CompactionOptions(), input_files, level, vstorage_.get(), - mutable_cf_options_, mutable_db_options_, /*output_path_id=*/0)); + std::unique_ptr comp1( + level_compaction_picker.PickCompactionForCompactFiles( + CompactionOptions(), input_files, level, vstorage_.get(), + mutable_cf_options_, mutable_db_options_, /*output_path_id=*/0)); { // [600, ts=50000] to [600, ts=50000] is the range to check. @@ -3884,7 +4046,7 @@ TEST_F(CompactionPickerU64TsTest, CannotTrivialMoveUniversal) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); assert(compaction); ASSERT_TRUE(!compaction->is_trivial_move()); } @@ -3925,9 +4087,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, OverlapWithNormalCompaction) { ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers( &input_files, &input_set, vstorage_.get(), comp_options)); - std::unique_ptr comp1(level_compaction_picker.CompactFiles( - comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, - mutable_db_options_, 0)); + std::unique_ptr comp1( + level_compaction_picker.PickCompactionForCompactFiles( + comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); input_set.clear(); input_files.clear(); @@ -3940,7 +4103,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest, OverlapWithNormalCompaction) { ASSERT_EQ(enable_per_key_placement_, level_compaction_picker.FilesRangeOverlapWithCompaction( input_files, 6, - Compaction::EvaluatePenultimateLevel( + Compaction::EvaluateProximalLevel( vstorage_.get(), mutable_cf_options_, ioptions_, 0, 6))); } @@ -3971,9 +4134,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlap) { ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers( &input_files, &input_set, vstorage_.get(), comp_options)); - std::unique_ptr comp1(level_compaction_picker.CompactFiles( - comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, - mutable_db_options_, 0)); + std::unique_ptr comp1( + level_compaction_picker.PickCompactionForCompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); input_set.clear(); input_files.clear(); @@ -4013,9 +4177,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( &input_files, &input_set, vstorage_.get(), comp_options)); - std::unique_ptr comp1(universal_compaction_picker.CompactFiles( - comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, - mutable_db_options_, 0)); + std::unique_ptr comp1( + universal_compaction_picker.PickCompactionForCompactFiles( + comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); input_set.clear(); input_files.clear(); @@ -4028,7 +4193,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest, ASSERT_EQ(enable_per_key_placement_, universal_compaction_picker.FilesRangeOverlapWithCompaction( input_files, 6, - Compaction::EvaluatePenultimateLevel( + Compaction::EvaluateProximalLevel( vstorage_.get(), mutable_cf_options_, ioptions_, 0, 6))); } @@ -4060,9 +4225,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlapUniversal) { ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( &input_files, &input_set, vstorage_.get(), comp_options)); - std::unique_ptr comp1(universal_compaction_picker.CompactFiles( - comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, - mutable_db_options_, 0)); + std::unique_ptr comp1( + universal_compaction_picker.PickCompactionForCompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); input_set.clear(); input_files.clear(); @@ -4076,9 +4242,9 @@ TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlapUniversal) { input_files, 5, Compaction::kInvalidLevel)); } -TEST_P(PerKeyPlacementCompactionPickerTest, PenultimateOverlapUniversal) { +TEST_P(PerKeyPlacementCompactionPickerTest, ProximalOverlapUniversal) { // This test is make sure the Tiered compaction would lock whole range of - // both output level and penultimate level + // both output level and proximal level if (enable_per_key_placement_) { mutable_cf_options_.preclude_last_level_data_seconds = 10000; } @@ -4098,7 +4264,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest, PenultimateOverlapUniversal) { UpdateVersionStorageInfo(); // the existing compaction is the 1st L4 file + L6 file - // then compaction of the 2nd L4 file to L5 (penultimate level) is overlapped + // then compaction of the 2nd L4 file to L5 (proximal level) is overlapped // when the tiered compaction feature is on. CompactionOptions comp_options; std::unordered_set input_set; @@ -4108,9 +4274,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, PenultimateOverlapUniversal) { ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( &input_files, &input_set, vstorage_.get(), comp_options)); - std::unique_ptr comp1(universal_compaction_picker.CompactFiles( - comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, - mutable_db_options_, 0)); + std::unique_ptr comp1( + universal_compaction_picker.PickCompactionForCompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); input_set.clear(); input_files.clear(); @@ -4159,9 +4326,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, LastLevelOnlyOverlapUniversal) { ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( &input_files, &input_set, vstorage_.get(), comp_options)); - std::unique_ptr comp1(universal_compaction_picker.CompactFiles( - comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, - mutable_db_options_, 0)); + std::unique_ptr comp1( + universal_compaction_picker.PickCompactionForCompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); // cannot compact file 41 if the preclude_last_level feature is on, otherwise // compact file 41 is okay. @@ -4187,9 +4355,9 @@ TEST_P(PerKeyPlacementCompactionPickerTest, LastLevelOnlyOverlapUniversal) { } TEST_P(PerKeyPlacementCompactionPickerTest, - LastLevelOnlyFailPenultimateUniversal) { + LastLevelOnlyFailProximalUniversal) { // This is to test last_level only compaction still unable to do the - // penultimate level compaction if there's already a file in the penultimate + // proximal level compaction if there's already a file in the proximal // level. // This should rarely happen in universal compaction, as the non-empty L5 // should be included in the compaction. @@ -4217,14 +4385,15 @@ TEST_P(PerKeyPlacementCompactionPickerTest, ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( &input_files, &input_set, vstorage_.get(), comp_options)); - std::unique_ptr comp1(universal_compaction_picker.CompactFiles( - comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, - mutable_db_options_, 0)); + std::unique_ptr comp1( + universal_compaction_picker.PickCompactionForCompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); ASSERT_TRUE(comp1); - ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel); + ASSERT_EQ(comp1->GetProximalLevel(), Compaction::kInvalidLevel); - // As comp1 cannot be output to the penultimate level, compacting file 40 to + // As comp1 cannot be output to the proximal level, compacting file 40 to // L5 is always safe. input_set.clear(); input_files.clear(); @@ -4235,18 +4404,19 @@ TEST_P(PerKeyPlacementCompactionPickerTest, ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction( input_files, 5, Compaction::kInvalidLevel)); - std::unique_ptr comp2(universal_compaction_picker.CompactFiles( - comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, - mutable_db_options_, 0)); + std::unique_ptr comp2( + universal_compaction_picker.PickCompactionForCompactFiles( + comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); ASSERT_TRUE(comp2); - ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel()); + ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetProximalLevel()); } TEST_P(PerKeyPlacementCompactionPickerTest, LastLevelOnlyConflictWithOngoingUniversal) { // This is to test last_level only compaction still unable to do the - // penultimate level compaction if there's already an ongoing compaction to - // the penultimate level + // proximal level compaction if there's already an ongoing compaction to + // the proximal level if (enable_per_key_placement_) { mutable_cf_options_.preclude_last_level_data_seconds = 10000; } @@ -4265,7 +4435,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest, Add(6, 60U, "101", "351", 60000000U); UpdateVersionStorageInfo(); - // create an ongoing compaction to L5 (penultimate level) + // create an ongoing compaction to L5 (proximal level) CompactionOptions comp_options; std::unordered_set input_set; input_set.insert(40); @@ -4273,12 +4443,13 @@ TEST_P(PerKeyPlacementCompactionPickerTest, ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( &input_files, &input_set, vstorage_.get(), comp_options)); - std::unique_ptr comp1(universal_compaction_picker.CompactFiles( - comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, - mutable_db_options_, 0)); + std::unique_ptr comp1( + universal_compaction_picker.PickCompactionForCompactFiles( + comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); ASSERT_TRUE(comp1); - ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel); + ASSERT_EQ(comp1->GetProximalLevel(), Compaction::kInvalidLevel); input_set.clear(); input_files.clear(); @@ -4289,15 +4460,16 @@ TEST_P(PerKeyPlacementCompactionPickerTest, ASSERT_EQ(enable_per_key_placement_, universal_compaction_picker.FilesRangeOverlapWithCompaction( input_files, 6, - Compaction::EvaluatePenultimateLevel( + Compaction::EvaluateProximalLevel( vstorage_.get(), mutable_cf_options_, ioptions_, 6, 6))); if (!enable_per_key_placement_) { - std::unique_ptr comp2(universal_compaction_picker.CompactFiles( - comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, - mutable_db_options_, 0)); + std::unique_ptr comp2( + universal_compaction_picker.PickCompactionForCompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); ASSERT_TRUE(comp2); - ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel()); + ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetProximalLevel()); } } @@ -4306,7 +4478,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest, // This is similar to `LastLevelOnlyConflictWithOngoingUniversal`, the only // change is the ongoing compaction to L5 has no overlap with the last level // compaction, so it's safe to move data from the last level to the - // penultimate level. + // proximal level. if (enable_per_key_placement_) { mutable_cf_options_.preclude_last_level_data_seconds = 10000; } @@ -4325,7 +4497,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest, Add(6, 60U, "101", "351", 60000000U); UpdateVersionStorageInfo(); - // create an ongoing compaction to L5 (penultimate level) + // create an ongoing compaction to L5 (proximal level) CompactionOptions comp_options; std::unordered_set input_set; input_set.insert(42); @@ -4333,12 +4505,13 @@ TEST_P(PerKeyPlacementCompactionPickerTest, ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers( &input_files, &input_set, vstorage_.get(), comp_options)); - std::unique_ptr comp1(universal_compaction_picker.CompactFiles( - comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, - mutable_db_options_, 0)); + std::unique_ptr comp1( + universal_compaction_picker.PickCompactionForCompactFiles( + comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); ASSERT_TRUE(comp1); - ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel); + ASSERT_EQ(comp1->GetProximalLevel(), Compaction::kInvalidLevel); input_set.clear(); input_files.clear(); @@ -4349,18 +4522,19 @@ TEST_P(PerKeyPlacementCompactionPickerTest, // always safe to move data up ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction( input_files, 6, - Compaction::EvaluatePenultimateLevel(vstorage_.get(), mutable_cf_options_, - ioptions_, 6, 6))); + Compaction::EvaluateProximalLevel(vstorage_.get(), mutable_cf_options_, + ioptions_, 6, 6))); // 2 compactions can be run in parallel - std::unique_ptr comp2(universal_compaction_picker.CompactFiles( - comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, - mutable_db_options_, 0)); + std::unique_ptr comp2( + universal_compaction_picker.PickCompactionForCompactFiles( + comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_, + mutable_db_options_, 0)); ASSERT_TRUE(comp2); if (enable_per_key_placement_) { - ASSERT_NE(Compaction::kInvalidLevel, comp2->GetPenultimateLevel()); + ASSERT_NE(Compaction::kInvalidLevel, comp2->GetProximalLevel()); } else { - ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel()); + ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetProximalLevel()); } } @@ -4417,7 +4591,7 @@ TEST_F(CompactionPickerTest, std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); ASSERT_EQ(num_levels - 2, compaction->start_level()); ASSERT_EQ(num_levels - 1, compaction->output_level()); @@ -4428,7 +4602,7 @@ TEST_F(CompactionPickerTest, level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(second_compaction); ASSERT_EQ(num_levels - 1, compaction->output_level()); ASSERT_EQ(num_levels - 2, compaction->start_level()); @@ -4475,7 +4649,7 @@ TEST_F(CompactionPickerTest, std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction); ASSERT_EQ(num_levels - 3, compaction->start_level()); ASSERT_EQ(num_levels - 2, compaction->output_level()); @@ -4525,7 +4699,7 @@ TEST_F(CompactionPickerTest, IntraL0WhenL0IsSmall) { std::unique_ptr compaction(compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(CompactionReason::kLevelL0FilesNum, compaction->compaction_reason()); @@ -4602,7 +4776,7 @@ TEST_F(CompactionPickerTest, UniversalMaxReadAmpLargeDB) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); if (i == kMaxRuns) { // There are in total i + 1 > kMaxRuns sorted runs. // This triggers compaction ignoring size_ratio. @@ -4650,11 +4824,1203 @@ TEST_F(CompactionPickerTest, UniversalMaxReadAmpSmallDB) { universal_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, - vstorage_.get(), &log_buffer_)); + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); ASSERT_EQ(nullptr, compaction); } } +TEST_F(CompactionPickerTest, StandaloneRangeDeletionOnlyPicksOlderFiles) { + NewVersionStorage(6, kCompactionStyleUniversal); + + // Create L0 files with overlapping ranges + // File 1: newest regular file (epoch 5), keys [100, 200] + Add(0, 1U, "100", "200", 1U, 0, 100, 100, 0, false, Temperature::kUnknown, + kUnknownOldestAncesterTime, kUnknownNewestKeyTime, Slice(), Slice(), 5); + + // File 2: standalone range deletion (epoch 4), keys [150, 250] + // This file should be marked as having only range deletions + Add(0, 2U, "150", "250", 1U, 0, 200, 200, 0, true, Temperature::kUnknown, + kUnknownOldestAncesterTime, kUnknownNewestKeyTime, Slice(), Slice(), 4); + + // Manually set file 2 as standalone range deletion + FileMetaData* range_del_file = file_map_[2U].first; + range_del_file->num_entries = 1; + range_del_file->num_range_deletions = 1; + ASSERT_TRUE(range_del_file->FileIsStandAloneRangeTombstone()); + + Add(4, 10U, "000", "400", 1U); + Add(5, 20U, "000", "400", 100); + + UpdateVersionStorageInfo(); + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + ASSERT_TRUE(universal_compaction_picker.NeedsCompaction(vstorage_.get())); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); + + ASSERT_NE(nullptr, compaction); + ASSERT_EQ(2U, compaction->num_input_levels()); + // First input level should be L0 with only the standalone range del file + // (file 2) + ASSERT_EQ(0, compaction->level(0)); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_TRUE(compaction->input(0, 0)->FileIsStandAloneRangeTombstone()); + + // Second input level should be L4 with file 10 + ASSERT_EQ(4, compaction->level(1)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(10U, compaction->input(1, 0)->fd.GetNumber()); +} + +// Tests for full_history_ts_low parameter in compaction picker. +// The full_history_ts_low parameter is used to control bottommost file marking +// for compaction when user-defined timestamps (UDT) are enabled. + +// Level compaction tests for full_history_ts_low: +// These tests verify that bottommost files are correctly marked/unmarked +// for compaction based on their max timestamp relative to full_history_ts_low. + +TEST_F(CompactionPickerU64TsTest, + BottommostNotMarkedWhenTimestampAboveFullHistoryTsLow) { + // Test that bottommost files are NOT marked for compaction when their + // max timestamp is >= full_history_ts_low. This prevents infinite + // compaction loops where timestamp could not be collapsed. + NewVersionStorage(6, kCompactionStyleLevel); + + // File has max_ts = 1000, full_history_ts_low = 500 + // Since 1000 >= 500, the file should NOT be marked for compaction. + SetupBottommostFileWithTimestamps( + /*min_ts=*/500, /*max_ts=*/1000, /*full_history_ts_low_val=*/500, + /*oldest_snapshot_seqnum=*/50, /*out_full_history_ts_low=*/nullptr); + + // File's max_ts (1000) >= full_history_ts_low (500), so it should NOT + // be marked for bottommost compaction + ASSERT_TRUE(vstorage_->BottommostFilesMarkedForCompaction().empty()); +} + +TEST_F(CompactionPickerU64TsTest, + BottommostMarkedWhenTimestampBelowFullHistoryTsLow) { + // Test that bottommost files ARE marked for compaction when their + // max timestamp is < full_history_ts_low. + NewVersionStorage(6, kCompactionStyleLevel); + + // File has max_ts = 100, full_history_ts_low = 500 + // Since 100 < 500, the file SHOULD be marked for compaction. + SetupBottommostFileWithTimestamps( + /*min_ts=*/50, /*max_ts=*/100, /*full_history_ts_low_val=*/500, + /*oldest_snapshot_seqnum=*/50, /*out_full_history_ts_low=*/nullptr); + + // File's max_ts (100) < full_history_ts_low (500), so it SHOULD be + // marked for bottommost compaction + ASSERT_EQ(1U, vstorage_->BottommostFilesMarkedForCompaction().size()); + ASSERT_EQ(5, vstorage_->BottommostFilesMarkedForCompaction()[0].first); + ASSERT_EQ(1U, vstorage_->BottommostFilesMarkedForCompaction()[0] + .second->fd.GetNumber()); +} + +TEST_F(CompactionPickerU64TsTest, + BottommostNotMarkedWithEmptyFullHistoryTsLow) { + // Test that when full_history_ts_low is empty, files are still marked + // based on seqno condition (backward compatibility behavior). + NewVersionStorage(6, kCompactionStyleLevel); + + std::string ts_small = MakeU64Timestamp(500); + std::string ts_large = MakeU64Timestamp(1000); + + // Add a file at bottommost level with seqno < oldest_snapshot + Add(5, 1U, "100", "200", /*file_size=*/1000, /*path_id=*/0, + /*smallest_seq=*/10, /*largest_seq=*/40, + /*compensated_file_size=*/1000, + /*marked_for_compact=*/false, Temperature::kUnknown, + kUnknownOldestAncesterTime, kUnknownNewestKeyTime, ts_small, ts_large); + + // Update version storage with empty full_history_ts_low + UpdateVersionStorageInfo(); + + // Update oldest snapshot with empty full_history_ts_low + vstorage_->UpdateOldestSnapshot( + /*oldest_snapshot_seqnum=*/50, + /*allow_ingest_behind=*/false, + /*ucmp=*/ucmp_, + /*full_history_ts_low=*/""); + + // With empty full_history_ts_low and UDT enabled, the file should NOT be + // marked. When full_history_ts_low is empty, it means it was never set, + // effectively 0, which is smaller than any valid timestamp. Since the file's + // max_timestamp would be >= full_history_ts_low, it won't be marked. + ASSERT_EQ(0U, vstorage_->BottommostFilesMarkedForCompaction().size()); +} + +TEST_F(CompactionPickerU64TsTest, LevelPickCompactionWithFullHistoryTsLow) { + // Test that level compaction correctly passes full_history_ts_low + // and picks compaction appropriately + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + + AddL0FilesWithTimestamps(/*ts1_val=*/100, /*ts2_val=*/200); + + UpdateVersionStorageInfo(); + + std::string full_history_ts_low = MakeU64Timestamp(150); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /*snapshot_checker=*/nullptr, vstorage_.get(), + &log_buffer_, full_history_ts_low, /*require_max_output_level=*/false)); + + // Compaction should be picked for L0 files + ASSERT_NE(nullptr, compaction); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(0, compaction->start_level()); +} + +TEST_F(CompactionPickerU64TsTest, UniversalPickCompactionWithFullHistoryTsLow) { + // Test that universal compaction correctly accepts full_history_ts_low + constexpr uint64_t kFileSize = 100000; + + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + NewVersionStorage(1, kCompactionStyleUniversal); + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + AddL0FilesWithTimestamps(/*ts1_val=*/100, /*ts2_val=*/200, kFileSize); + + UpdateVersionStorageInfo(); + + std::string full_history_ts_low = MakeU64Timestamp(150); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /*snapshot_checker=*/nullptr, + vstorage_.get(), &log_buffer_, full_history_ts_low, + /*require_max_output_level=*/false)); + + // Universal compaction should be picked + ASSERT_NE(nullptr, compaction); + ASSERT_EQ(2U, compaction->num_input_files(0)); +} + +// ============================================================================ +// FIFO Ratio-Based Compaction Picker Unit Tests +// Tests the actual FIFOCompactionPicker with use_kv_ratio_compaction option +// (PickRatioBasedIntraL0Compaction path). +// ============================================================================ + +TEST_F(CompactionPickerTest, FIFORatioBasedCompactionFileCountThreshold) { + // Test three file count scenarios relative to trigger (= 4): + // - fewer than trigger: no compaction + // - exactly trigger: compaction fires + // - more than trigger: compaction fires, picks >= 2 files + + // Sub-test 1: fewer than trigger (3 files < trigger 4) -> no compaction + { + SetupFIFORatioBased(10 * 1024 * 1024, 1ULL * 1024 * 1024 * 1024, 4); + FIFOCompactionPicker picker(ioptions_, &icmp_); + Add(0, 1U, "100", "200", 64 * 1024); + Add(0, 2U, "200", "300", 64 * 1024); + Add(0, 3U, "300", "400", 64 * 1024); + AddBlobFile(100, 64ULL * 1024 * 1024); + AddBlobFile(101, 64ULL * 1024 * 1024); + AddBlobFile(102, 64ULL * 1024 * 1024); + + auto compaction = PickFIFOCompaction(picker); + ASSERT_EQ(nullptr, compaction.get()) + << "Should not compact when file count < trigger"; + } + + // Sub-test 2: exactly trigger (4 files = trigger 4) -> compaction fires + { + SetupFIFORatioBased(10 * 1024 * 1024, 1ULL * 1024 * 1024 * 1024, 4); + FIFOCompactionPicker picker(ioptions_, &icmp_); + Add(0, 1U, "100", "200", 64 * 1024); + Add(0, 2U, "200", "300", 32 * 1024); + Add(0, 3U, "300", "400", 48 * 1024); + Add(0, 4U, "400", "500", 96 * 1024); + // sst_ratio ~ 240KB/256MB ~ 0.001, target ~ 250KB + AddBlobFile(100, 64ULL * 1024 * 1024); + AddBlobFile(101, 64ULL * 1024 * 1024); + AddBlobFile(102, 64ULL * 1024 * 1024); + AddBlobFile(103, 64ULL * 1024 * 1024); + + auto compaction = PickFIFOCompaction(picker); + ASSERT_NE(nullptr, compaction.get()) + << "Should compact when file count == trigger"; + ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); + } + + // Sub-test 3: more than trigger (8 files > trigger 4) -> compaction fires + { + SetupFIFORatioBased(100 * 1024 * 1024, 500ULL * 1024 * 1024, 4); + FIFOCompactionPicker picker(ioptions_, &icmp_); + Add(0, 1U, "100", "199", 64 * 1024); + Add(0, 2U, "200", "299", 32 * 1024); + Add(0, 3U, "300", "399", 48 * 1024); + Add(0, 4U, "400", "499", 96 * 1024); + Add(0, 5U, "500", "599", 64 * 1024); + Add(0, 6U, "600", "699", 48 * 1024); + Add(0, 7U, "700", "799", 64 * 1024); + Add(0, 8U, "800", "899", 64 * 1024); + for (uint64_t i = 0; i < 8; i++) { + AddBlobFile(100 + i, 50ULL * 1024 * 1024); + } + + auto compaction = PickFIFOCompaction(picker); + ASSERT_NE(nullptr, compaction.get()) + << "Should compact when file count > trigger"; + ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles, + compaction->compaction_reason()); + ASSERT_GE(compaction->num_input_files(0), 2); + } +} + +TEST_F(CompactionPickerTest, FIFORatioBasedCompactionNoBlobsFallback) { + // When total_blob == 0, sst_ratio = 1.0 and target becomes huge + // (max_data_files_size / trigger). With the tiered algorithm, the tier + // boundaries descend from target, and the lowest boundary where files + // can accumulate will be found. The algorithm should still work + // correctly (not crash) and produce a compaction at a low tier boundary. + SetupFIFORatioBased(10 * 1024 * 1024, 10ULL * 1024 * 1024 * 1024, 4); + FIFOCompactionPicker picker(ioptions_, &icmp_); + + // Small SST files, no blob files + Add(0, 1U, "100", "200", 64 * 1024); + Add(0, 2U, "200", "300", 64 * 1024); + Add(0, 3U, "300", "400", 64 * 1024); + Add(0, 4U, "400", "500", 64 * 1024); + + // No blob files added -- total_blob == 0 + + // With sst_ratio=1.0 and 10GB cap, target = 10GB/4 = 2.5GB. + // Tiered boundaries descend: 2.5GB, 625MB, ..., ~152KB, ~38KB, ... + // At boundary ~152KB, 4 files of 64KB accumulate to 256KB >= 152KB. + // The tiered algorithm finds a viable batch and compacts. + auto compaction = PickFIFOCompaction(picker); + ASSERT_NE(nullptr, compaction.get()); + ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles, + compaction->compaction_reason()); + ASSERT_GE(compaction->num_input_files(0), 2); +} + +TEST_F(CompactionPickerTest, FIFORatioBasedCompactionNoRecompaction) { + // When all files are at or above the target size (graduated), + // no re-compaction should happen. Files >= target are skipped at every + // tier boundary. + SetupFIFORatioBased(100 * 1024 * 1024, 500ULL * 1024 * 1024, 4); + FIFOCompactionPicker picker(ioptions_, &icmp_); + // Use max_compaction_bytes to set an explicit target of 256KB. + // Make all files >= 256KB so they are "graduated" (at or above target). + mutable_cf_options_.max_compaction_bytes = 256 * 1024; + + // All files at 300KB, which is >= target (256KB) -> graduated + Add(0, 1U, "100", "199", 300 * 1024); + Add(0, 2U, "200", "299", 300 * 1024); + Add(0, 3U, "300", "399", 300 * 1024); + Add(0, 4U, "400", "499", 300 * 1024); + + // All files are at/above target -> graduated -> no compaction. + auto compaction = PickFIFOCompaction(picker); + ASSERT_EQ(nullptr, compaction.get()); +} + +TEST_F(CompactionPickerTest, + FIFORatioBasedCompactionWithExplicitMaxCompactionBytes) { + // When max_compaction_bytes > 0, it overrides the auto-calculated target. + SetupFIFORatioBased(100 * 1024 * 1024, 10ULL * 1024 * 1024 * 1024, 4); + FIFOCompactionPicker picker(ioptions_, &icmp_); + // Explicitly set target to 256KB + mutable_cf_options_.max_compaction_bytes = 256 * 1024; + + // 6 small SST files + Add(0, 1U, "100", "199", 64 * 1024); + Add(0, 2U, "200", "299", 64 * 1024); + Add(0, 3U, "300", "399", 64 * 1024); + Add(0, 4U, "400", "499", 64 * 1024); + Add(0, 5U, "500", "599", 64 * 1024); + Add(0, 6U, "600", "699", 64 * 1024); + + // No blob files needed when max_compaction_bytes is explicitly set + + // target = max_compaction_bytes = 256KB. + // Tier boundaries descend from 256KB: [25KB, 256KB] (trigger=4, floor=10KB). + // At boundary 25KB: each 64KB file >= 25KB -> skipped. + // At boundary 256KB: all 64KB files < 256KB -> accumulated until >= 256KB. + auto compaction = PickFIFOCompaction(picker); + ASSERT_NE(nullptr, compaction.get()); + ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, FIFORatioBasedCompactionFallbackToOldPath) { + // When use_kv_ratio_compaction is false, PickIntraL0Compaction should + // fall through to the old PickCostBasedIntraL0Compaction path. + + // Sub-test 1: allow_compaction = false -> no intra-L0 at all + { + SetupFIFORatioBased(10 * 1024 * 1024, 0, 4, + /*allow_compaction=*/false, /*use_kv_ratio=*/false); + FIFOCompactionPicker picker(ioptions_, &icmp_); + Add(0, 1U, "100", "200", 64 * 1024); + Add(0, 2U, "200", "300", 64 * 1024); + Add(0, 3U, "300", "400", 64 * 1024); + Add(0, 4U, "400", "500", 64 * 1024); + + // Total size (256KB) < max_table_files_size (10MB), so no deletion. + // allow_compaction=false, so no intra-L0 either. + auto compaction = PickFIFOCompaction(picker); + ASSERT_EQ(nullptr, compaction.get()); + } + + // Sub-test 2: allow_compaction = true, use_kv_ratio = false + // -> falls through to old PickCostBasedIntraL0Compaction path + { + SetupFIFORatioBased(10 * 1024 * 1024, 0, 4, + /*allow_compaction=*/true, /*use_kv_ratio=*/false); + // The old path uses max_compaction_bytes to cap total input size. + // In production this is sanitized to target_file_size_base * 25, + // but tests bypass sanitization, so set it explicitly. + mutable_cf_options_.max_compaction_bytes = 64 * 1024 * 1024; // 64MB + FIFOCompactionPicker picker(ioptions_, &icmp_); + Add(0, 1U, "100", "200", 64 * 1024); + Add(0, 2U, "200", "300", 64 * 1024); + Add(0, 3U, "300", "400", 64 * 1024); + Add(0, 4U, "400", "500", 64 * 1024); + + // Total size (256KB) < max_table_files_size (10MB), so no deletion. + // allow_compaction=true and use_kv_ratio=false -> old path. + // 4 files >= trigger(4), per_del = 256KB/3 ~ 85KB < 1.1*WBS -> passes. + auto compaction = PickFIFOCompaction(picker); + ASSERT_NE(nullptr, compaction.get()) + << "Old path should compact when allow_compaction=true"; + ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles, + compaction->compaction_reason()); + } +} + +// ============================================================================ +// FIFO Option Validation Tests +// Tests that ColumnFamilyData::ValidateOptions rejects invalid configurations +// for use_kv_ratio_compaction. +// ============================================================================ + +TEST_F(CompactionPickerTest, FIFOOptionValidation) { + auto validate = [](std::function configure) { + ColumnFamilyOptions cf_opts; + cf_opts.compaction_style = kCompactionStyleFIFO; + cf_opts.compaction_options_fifo.allow_compaction = true; + cf_opts.compaction_options_fifo.use_kv_ratio_compaction = true; + cf_opts.compaction_options_fifo.max_data_files_size = + 1ULL * 1024 * 1024 * 1024; + cf_opts.num_levels = 1; + configure(cf_opts); + return ColumnFamilyData::ValidateOptions(DBOptions(), cf_opts); + }; + + // use_kv_ratio_compaction requires FIFO compaction style + ASSERT_TRUE(validate([](auto& o) { + o.compaction_style = kCompactionStyleLevel; + }).IsInvalidArgument()); + + // use_kv_ratio_compaction requires allow_compaction + ASSERT_TRUE(validate([](auto& o) { + o.compaction_options_fifo.allow_compaction = false; + }).IsInvalidArgument()); + + // use_kv_ratio_compaction requires max_data_files_size > 0 + ASSERT_TRUE(validate([](auto& o) { + o.compaction_options_fifo.max_data_files_size = 0; + }).IsInvalidArgument()); + + // Accepts multi-level (for migration from level/universal to FIFO) + ASSERT_OK(validate([](auto& o) { o.num_levels = 4; })); + + // Accepts valid single-level config + ASSERT_OK(validate([](auto& /*o*/) {})); + + // max_data_files_size < max_table_files_size is invalid when non-zero + ASSERT_TRUE(validate([](auto& o) { + o.compaction_options_fifo.use_kv_ratio_compaction = false; + o.compaction_options_fifo.max_data_files_size = 0; + o.compaction_options_fifo.max_table_files_size = + 1ULL * 1024 * 1024 * 1024; + o.compaction_options_fifo.max_data_files_size = + 500ULL * 1024 * 1024; + }).IsInvalidArgument()); + + // max_data_files_size == max_table_files_size is valid + ASSERT_OK(validate([](auto& o) { + o.compaction_options_fifo.use_kv_ratio_compaction = false; + o.compaction_options_fifo.max_data_files_size = 0; + o.compaction_options_fifo.max_table_files_size = 1ULL * 1024 * 1024 * 1024; + o.compaction_options_fifo.max_data_files_size = 1ULL * 1024 * 1024 * 1024; + })); +} + +// ============================================================================ +// FIFO Ratio-Based Compaction: Multi-Level Migration Graceful Skip +// Tests that PickRatioBasedIntraL0Compaction gracefully skips when non-L0 +// levels still contain files (e.g., during migration from level/universal +// to FIFO), and resumes once all data has been drained to L0. +// ============================================================================ + +TEST_F(CompactionPickerTest, FIFORatioBasedMultiLevelMigration) { + // Sub-case 1: During migration (non-L0 levels have files). + // Ratio-based intra-L0 compaction should be skipped. + { + SetupFIFORatioBased(/*max_table_files_size=*/100 * 1024 * 1024, + /*max_data_files_size=*/1ULL * 1024 * 1024 * 1024, + /*trigger=*/4, + /*allow_compaction=*/true, + /*use_kv_ratio=*/true, + /*num_levels=*/4); + FIFOCompactionPicker picker(ioptions_, &icmp_); + + Add(0, 1U, "100", "200", 64 * 1024); + Add(0, 2U, "200", "300", 64 * 1024); + Add(0, 3U, "300", "400", 64 * 1024); + Add(0, 4U, "400", "500", 64 * 1024); + Add(0, 5U, "500", "600", 64 * 1024); + Add(2, 10U, "100", "600", 50 * 1024 * 1024); + AddBlobFile(100, 64ULL * 1024 * 1024); + AddBlobFile(101, 64ULL * 1024 * 1024); + + auto compaction = PickFIFOCompaction(picker); + if (compaction != nullptr) { + if (compaction->compaction_reason() == + CompactionReason::kFIFOReduceNumFiles) { + // Cost-based path is fine; verify it's not ratio-based. + ASSERT_EQ(16 * 1024 * 1024, compaction->max_output_file_size()); + } + } + } + + // Sub-case 2: After migration (only L0 has files). + // Ratio-based compaction should resume normally. + { + SetupFIFORatioBased(/*max_table_files_size=*/100 * 1024 * 1024, + /*max_data_files_size=*/1ULL * 1024 * 1024 * 1024, + /*trigger=*/4, + /*allow_compaction=*/true, + /*use_kv_ratio=*/true, + /*num_levels=*/4); + FIFOCompactionPicker picker(ioptions_, &icmp_); + + Add(0, 1U, "100", "200", 64 * 1024); + Add(0, 2U, "200", "300", 32 * 1024); + Add(0, 3U, "300", "400", 48 * 1024); + Add(0, 4U, "400", "500", 96 * 1024); + AddBlobFile(100, 64ULL * 1024 * 1024); + AddBlobFile(101, 64ULL * 1024 * 1024); + AddBlobFile(102, 64ULL * 1024 * 1024); + AddBlobFile(103, 64ULL * 1024 * 1024); + + auto compaction = PickFIFOCompaction(picker); + ASSERT_NE(nullptr, compaction.get()) + << "Should compact when non-L0 levels are empty (migration complete)"; + ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); + } +} + +// ============================================================================ +// FIFO TTL Compaction with Blob-Aware Estimation Tests +// Tests that PickTTLCompaction correctly estimates remaining data (SST + blob) +// in both single-level and multi-level FIFO configurations. +// ============================================================================ + +TEST_F(CompactionPickerTest, FIFOTTLBlobEstimationSingleLevel) { + // Single-level FIFO with TTL and max_data_files_size. + // After dropping expired L0 SSTs, the blob estimate should be proportional + // to the remaining SST fraction. + // + // Common setup: L0 = 4 files x 50KB = 200KB, files 3,4 expired. + // Remaining SST after drop = 100KB = 50%. + + auto run = [&](uint64_t blob_total, uint64_t limit, bool expect_ttl_fires) { + ioptions_.compaction_style = kCompactionStyleFIFO; + NewVersionStorage(1, kCompactionStyleFIFO); + mutable_cf_options_.compaction_options_fifo.max_table_files_size = limit; + mutable_cf_options_.compaction_options_fifo.max_data_files_size = limit; + mutable_cf_options_.compaction_options_fifo.allow_compaction = true; + mutable_cf_options_.ttl = 3600; + FIFOCompactionPicker picker(ioptions_, &icmp_); + + uint64_t recent_time = static_cast(time(nullptr)); + Add(0, 1U, "100", "200", 50 * 1024, 0, 100, 100, 0, false, + Temperature::kUnknown, kUnknownOldestAncesterTime, recent_time); + Add(0, 2U, "200", "300", 50 * 1024, 0, 100, 100, 0, false, + Temperature::kUnknown, kUnknownOldestAncesterTime, recent_time); + Add(0, 3U, "300", "400", 50 * 1024, 0, 100, 100, 0, false, + Temperature::kUnknown, kUnknownOldestAncesterTime, 1); + Add(0, 4U, "400", "500", 50 * 1024, 0, 100, 100, 0, false, + Temperature::kUnknown, kUnknownOldestAncesterTime, 1); + if (blob_total > 0) { + AddBlobFile(100, blob_total / 2); + AddBlobFile(101, blob_total / 2); + } + + auto compaction = PickFIFOCompaction(picker); + if (expect_ttl_fires) { + ASSERT_NE(nullptr, compaction.get()) + << "TTL compaction should fire when remaining data < limit"; + ASSERT_EQ(CompactionReason::kFIFOTtl, compaction->compaction_reason()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + } else { + if (compaction != nullptr) { + ASSERT_NE(CompactionReason::kFIFOTtl, compaction->compaction_reason()) + << "TTL should not fire when remaining data still exceeds limit"; + } + } + }; + + // Sub-case 1: Under limit after drop. + // blob=400KB, limit=500KB. + // effective = 100KB + (100KB/200KB)*400KB = 300KB < 500KB -> fires. + run(400 * 1024, 500 * 1024, /*expect_ttl_fires=*/true); + + // Sub-case 2: Over limit after drop. + // blob=4MB, limit=100KB. + // effective = 100KB + (100KB/200KB)*4MB ~ 2MB >> 100KB -> does NOT fire. + run(4ULL * 1024 * 1024, 100 * 1024, /*expect_ttl_fires=*/false); + + // Sub-case 3: No blob files. Falls back to SST-only estimation. + // blob=0, limit=150KB. remaining SST = 100KB < 150KB -> fires. + run(0, 150 * 1024, /*expect_ttl_fires=*/true); +} + +TEST_F(CompactionPickerTest, FIFOTTLBlobEstimationMultiLevel) { + // Multi-level FIFO (migration) with TTL and max_data_files_size. + // This is the ritical bug fix scenario: + // - L0 has some SSTs, L2 has legacy SSTs from migration + // - Blob files cover ALL levels + // - The estimation must use total SST across ALL levels (not just L0) + // to avoid inflating the blob proportion. + // + // Setup: + // L0: 4 files x 50KB = 200KB SST (files 3,4 expired) + // L2: 1 file x 200KB SST (legacy migration data) + // Total SST = 400KB + // Blob: 800KB total + // max_data_files_size = 1000KB + // Remaining SST after TTL drop = 400KB - 100KB = 300KB + // + // CORRECT (fixed): effective = 300KB + (300KB/400KB)*800KB = 300+600 = + // 900KB < 1000KB -> fires BUG (old): effective = 100KB + + // (100KB/200KB)*800KB = 100+400 = 500KB < 1000KB -> fires + // (coincidentally fires too, but with wrong estimate) + // + // To distinguish correct vs buggy behavior, use a limit that triggers the + // difference: set max_data_files_size = 850KB. + // CORRECT: effective = 300KB + (300KB/400KB)*800KB = 900KB > 850KB -> does + // NOT fire BUG: effective = 100KB + (100KB/200KB)*800KB = 500KB < 850KB + // -> fires (wrong!) + ioptions_.compaction_style = kCompactionStyleFIFO; + NewVersionStorage(4, kCompactionStyleFIFO); + mutable_cf_options_.compaction_options_fifo.max_table_files_size = + 850 * 1024; // match max_data_files_size + mutable_cf_options_.compaction_options_fifo.max_data_files_size = 850 * 1024; + mutable_cf_options_.compaction_options_fifo.allow_compaction = true; + mutable_cf_options_.ttl = 3600; + FIFOCompactionPicker picker(ioptions_, &icmp_); + + uint64_t recent_time = static_cast(time(nullptr)); + // L0 files: 2 recent, 2 expired + Add(0, 1U, "100", "200", 50 * 1024, 0, 100, 100, 0, false, + Temperature::kUnknown, kUnknownOldestAncesterTime, recent_time); + Add(0, 2U, "200", "300", 50 * 1024, 0, 100, 100, 0, false, + Temperature::kUnknown, kUnknownOldestAncesterTime, recent_time); + Add(0, 3U, "300", "400", 50 * 1024, 0, 100, 100, 0, false, + Temperature::kUnknown, kUnknownOldestAncesterTime, 1); + Add(0, 4U, "400", "500", 50 * 1024, 0, 100, 100, 0, false, + Temperature::kUnknown, kUnknownOldestAncesterTime, 1); + // L2 legacy migration file + Add(2, 10U, "100", "600", 200 * 1024); + // Blob files (associated with ALL levels) + AddBlobFile(100, 400 * 1024); + AddBlobFile(101, 400 * 1024); + + auto compaction = PickFIFOCompaction(picker); + // With correct all-levels estimation: + // remaining_sst_all = 400KB - 100KB(dropped) = 300KB + // effective = 300KB + (300KB/400KB)*800KB = 900KB > 850KB + // -> TTL should NOT fire (falls through to size-based) + if (compaction != nullptr) { + ASSERT_NE(CompactionReason::kFIFOTtl, compaction->compaction_reason()) + << "Multi-level FIFO: TTL should not fire when correct all-levels " + "blob estimation shows data still exceeds limit"; + } +} + +TEST_F(CompactionPickerTest, FIFOBlobAwareSizeDropping) { + // PickSizeCompaction with max_data_files_size should account for blob data. + // + // Sub-case 1: Single-level. SST = 200KB, blob = 500MB, limit = 200MB. + // effective_size ~ 500MB >> 200MB -> drops from L0. + { + SetupFIFORatioBased(/*max_table=*/200ULL * 1024 * 1024, + /*max_data=*/200ULL * 1024 * 1024, + /*trigger=*/4, + /*allow_compaction=*/true, + /*use_kv_ratio=*/false); + FIFOCompactionPicker picker(ioptions_, &icmp_); + + Add(0, 1U, "100", "199", 40 * 1024); + Add(0, 2U, "200", "299", 40 * 1024); + Add(0, 3U, "300", "399", 40 * 1024); + Add(0, 4U, "400", "499", 40 * 1024); + Add(0, 5U, "500", "599", 40 * 1024); + AddBlobFile(100, 100ULL * 1024 * 1024); + AddBlobFile(101, 100ULL * 1024 * 1024); + AddBlobFile(102, 100ULL * 1024 * 1024); + AddBlobFile(103, 100ULL * 1024 * 1024); + AddBlobFile(104, 100ULL * 1024 * 1024); + + auto compaction = PickFIFOCompaction(picker); + ASSERT_NE(nullptr, compaction.get()); + ASSERT_EQ(CompactionReason::kFIFOMaxSize, compaction->compaction_reason()); + ASSERT_GE(compaction->num_input_files(0), 1); + } + + // Sub-case 2: Multi-level (migration). L0=100KB, L2=150KB, blob=500KB. + // effective_size = 250KB + 500KB = 750KB > 400KB -> drops from L2. + { + ioptions_.compaction_style = kCompactionStyleFIFO; + NewVersionStorage(4, kCompactionStyleFIFO); + mutable_cf_options_.compaction_options_fifo.max_table_files_size = + 400 * 1024; + mutable_cf_options_.compaction_options_fifo.max_data_files_size = + 400 * 1024; + mutable_cf_options_.compaction_options_fifo.allow_compaction = true; + mutable_cf_options_.ttl = 0; + FIFOCompactionPicker picker(ioptions_, &icmp_); + + Add(0, 1U, "100", "200", 50 * 1024); + Add(0, 2U, "200", "300", 50 * 1024); + Add(2, 10U, "100", "300", 50 * 1024); + Add(2, 11U, "300", "500", 50 * 1024); + Add(2, 12U, "500", "700", 50 * 1024); + AddBlobFile(100, 250 * 1024); + AddBlobFile(101, 250 * 1024); + + auto compaction = PickFIFOCompaction(picker); + ASSERT_NE(nullptr, compaction.get()); + ASSERT_EQ(CompactionReason::kFIFOMaxSize, compaction->compaction_reason()); + ASSERT_EQ(2, compaction->start_level()); + ASSERT_GE(compaction->num_input_files(0), 1U); + } + + // Sub-case 3: Under limit. SST = 256KB, blob = 200MB, limit = 1GB. + // effective_size ~ 200MB < 1GB -> no dropping. + { + SetupFIFORatioBased(/*max_table=*/1ULL * 1024 * 1024 * 1024, + /*max_data=*/1ULL * 1024 * 1024 * 1024, + /*trigger=*/4, + /*allow_compaction=*/true, + /*use_kv_ratio=*/true); + FIFOCompactionPicker picker(ioptions_, &icmp_); + + Add(0, 1U, "100", "199", 64 * 1024); + Add(0, 2U, "200", "299", 64 * 1024); + Add(0, 3U, "300", "399", 64 * 1024); + Add(0, 4U, "400", "499", 64 * 1024); + AddBlobFile(100, 50ULL * 1024 * 1024); + AddBlobFile(101, 50ULL * 1024 * 1024); + AddBlobFile(102, 50ULL * 1024 * 1024); + AddBlobFile(103, 50ULL * 1024 * 1024); + + auto compaction = PickFIFOCompaction(picker); + if (compaction) { + ASSERT_NE(CompactionReason::kFIFOMaxSize, + compaction->compaction_reason()); + } + } +} + +// ============================================================================ +// FIFO Blob-Aware Score Computation Test +// Tests that ComputeCompactionScore includes blob sizes when +// max_data_files_size > 0. +// ============================================================================ + +TEST_F(CompactionPickerTest, FIFOBlobAwareScoreComputation) { + // Sub-case 1: With max_data_files_size, score includes blob sizes. + // SST = 100KB, blob = 500MB, max_data = 200MB -> score ~ 2.5 + { + ioptions_.compaction_style = kCompactionStyleFIFO; + NewVersionStorage(1, kCompactionStyleFIFO); + mutable_cf_options_.compaction_options_fifo.max_table_files_size = + 200ULL * 1024 * 1024; + mutable_cf_options_.compaction_options_fifo.max_data_files_size = + 200ULL * 1024 * 1024; + mutable_cf_options_.compaction_options_fifo.allow_compaction = false; + mutable_cf_options_.level0_file_num_compaction_trigger = 4; + + Add(0, 1U, "100", "199", 25 * 1024); + Add(0, 2U, "200", "299", 25 * 1024); + Add(0, 3U, "300", "399", 25 * 1024); + Add(0, 4U, "400", "499", 25 * 1024); + AddBlobFile(100, 500ULL * 1024 * 1024); + UpdateVersionStorageInfo(); + + double score = vstorage_->CompactionScore(0); + ASSERT_GT(score, 2.0) << "Score should reflect 500MB/200MB ~ 2.5"; + } + + // Sub-case 2: Without max_data_files_size, score ignores blobs. + // SST = 400KB < 1MB, blob = 500MB ignored -> score ~ 0.4 + { + ioptions_.compaction_style = kCompactionStyleFIFO; + NewVersionStorage(1, kCompactionStyleFIFO); + mutable_cf_options_.compaction_options_fifo.max_table_files_size = + 1ULL * 1024 * 1024; + mutable_cf_options_.compaction_options_fifo.max_data_files_size = 0; + mutable_cf_options_.compaction_options_fifo.allow_compaction = false; + mutable_cf_options_.level0_file_num_compaction_trigger = 4; + + Add(0, 1U, "100", "199", 100 * 1024); + Add(0, 2U, "200", "299", 100 * 1024); + Add(0, 3U, "300", "399", 100 * 1024); + Add(0, 4U, "400", "499", 100 * 1024); + AddBlobFile(100, 500ULL * 1024 * 1024); + UpdateVersionStorageInfo(); + + double score = vstorage_->CompactionScore(0); + ASSERT_LT(score, 1.0) + << "Score should be < 1 when only SST sizes are counted"; + } +} + +// ============================================================================ +// FIFO + BlobDB Intra-L0 Compaction Picking Tests +// +// These tests validate the tiered intra-L0 compaction picking algorithm +// over multiple flush/compaction cycles. Each round: +// 1. Add a flush file to the L0 file list +// 2. Rebuild VersionStorageInfo and call FIFOCompactionPicker::PickCompaction +// 3. If compaction is picked, update the file list accordingly +// 4. Repeat +// +// The compaction PICKING uses the real FIFOCompactionPicker -- this ensures +// the tests always match the production picking logic. The rest of the +// system (compaction execution, file metadata updates, FIFO dropping) is +// handled by test helpers, since wiring up the full compaction execution +// pipeline (CompactionJob, VersionEdit, etc.) would add significant +// complexity without testing the picking logic more thoroughly. +// +// ============================================================================ + +class FIFORatioBasedCompactionPickingTest : public CompactionPickerTest { + protected: + struct L0File { + uint64_t size; // SST file size in bytes + uint64_t blob_size; // Associated blob data size + uint64_t age; // Creation order (lower = older) + bool is_compacted; // Created by compaction (vs flush) + }; + + // Pick compaction using FIFOCompactionPicker. + // + // Rebuilds VersionStorageInfo from the files vector and calls + // PickCompaction on the given picker. Maps the returned + // Compaction's input files back to vector indices. + // + // Returns the picked indices, or empty if no compaction. + // Also returns the compaction reason via out-parameter. + std::vector PickCompactionFromFiles( + FIFOCompactionPicker& picker, const std::vector& files, + uint64_t max_table_files_size, uint64_t max_data_files_size, int trigger, + CompactionReason* out_reason = nullptr) { + // Rebuild VersionStorageInfo from the current file list + NewVersionStorage(1, kCompactionStyleFIFO); + mutable_cf_options_.compaction_options_fifo.max_table_files_size = + max_table_files_size; + mutable_cf_options_.compaction_options_fifo.max_data_files_size = + max_data_files_size; + mutable_cf_options_.compaction_options_fifo.allow_compaction = true; + mutable_cf_options_.compaction_options_fifo.use_kv_ratio_compaction = true; + mutable_cf_options_.level0_file_num_compaction_trigger = trigger; + + // Add files: newest first. Use descending file numbers so L0 sort + // (newest-first by epoch/seqno/file_number) matches our order. + uint32_t base_fn = static_cast(files.size()); + for (size_t i = 0; i < files.size(); i++) { + uint32_t fn = base_fn - static_cast(i); + std::string smallest = "k" + std::to_string(10000 + fn * 10); + std::string largest = "k" + std::to_string(10000 + fn * 10 + 9); + Add(0, fn, smallest.c_str(), largest.c_str(), files[i].size); + } + + // Add one blob file with the total blob size + uint64_t total_blob = 0; + for (const auto& f : files) { + total_blob += f.blob_size; + } + if (total_blob > 0) { + AddBlobFile(9999, total_blob); + } + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /*snapshot_checker=*/nullptr, + vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/"")); + + if (!compaction) return {}; + + if (out_reason) { + *out_reason = compaction->compaction_reason(); + } + + // For size-based dropping (kFIFOMaxSize / kFIFOTtl), map input files + // back to sim indices, same as for intra-L0. + std::vector result; + for (size_t j = 0; j < compaction->num_input_files(0); j++) { + uint32_t fn = + static_cast(compaction->input(0, j)->fd.GetNumber()); + size_t idx = base_fn - fn; + result.push_back(idx); + } + + // Unregister so the picker allows the next compaction + picker.UnregisterCompaction(compaction.get()); + + return result; + } + + // Execute one compaction: merge input files into 1 output + void ExecuteCompaction(std::vector& files, + const std::vector& input_indices, + uint64_t& global_age) { + uint64_t output_size = 0; + uint64_t output_blob = 0; + for (size_t idx : input_indices) { + output_size += files[idx].size; + output_blob += files[idx].blob_size; + } + + size_t oldest_input_pos = 0; + for (size_t idx : input_indices) { + oldest_input_pos = std::max(oldest_input_pos, idx); + } + + std::vector sorted_indices = input_indices; + std::sort(sorted_indices.rbegin(), sorted_indices.rend()); + for (size_t idx : sorted_indices) { + files.erase(files.begin() + idx); + } + + size_t insert_pos = oldest_input_pos; + for (size_t idx : sorted_indices) { + if (idx < oldest_input_pos) insert_pos--; + } + insert_pos = std::min(insert_pos, files.size()); + files.insert(files.begin() + insert_pos, + {output_size, output_blob, global_age++, true}); + } + + // Compute statistics about compacted file sizes + struct FileStats { + uint64_t count; + uint64_t min_size; + uint64_t max_size; + double mean_size; + double cv; + }; + + FileStats ComputeStats(const std::vector& files, + bool compacted_only) { + std::vector sizes; + for (const auto& f : files) { + if (!compacted_only || f.is_compacted) { + sizes.push_back(f.size); + } + } + if (sizes.empty()) return {0, 0, 0, 0.0, 0.0}; + + uint64_t sum = 0; + uint64_t min_s = UINT64_MAX, max_s = 0; + for (uint64_t s : sizes) { + sum += s; + min_s = std::min(min_s, s); + max_s = std::max(max_s, s); + } + double mean = static_cast(sum) / sizes.size(); + + double variance = 0; + for (uint64_t s : sizes) { + double diff = static_cast(s) - mean; + variance += diff * diff; + } + variance /= sizes.size(); + double stddev = std::sqrt(variance); + double cv = mean > 0 ? stddev / mean : 0; + + return {sizes.size(), min_s, max_s, mean, cv}; + } + + // Track write amplification + struct WriteAmpTracker { + uint64_t bytes_flushed = 0; + uint64_t bytes_compacted = 0; + + double sst_write_amp() const { + return bytes_flushed > 0 + ? static_cast(bytes_flushed + bytes_compacted) / + bytes_flushed + : 1.0; + } + }; + + struct TestState { + std::vector files; + uint64_t global_age = 0; + WriteAmpTracker wa; + int compaction_count = 0; + uint64_t max_file_count_seen = 0; + }; + + using FlushGenerator = + std::function(int round)>; + + // Core test loop: flush -> pick -> execute -> repeat. + void RunFlushAndCompact(TestState& s, int num_rounds, int trigger, + uint64_t max_data_files_size, + const FlushGenerator& gen) { + ioptions_.compaction_style = kCompactionStyleFIFO; + FIFOCompactionPicker picker(ioptions_, &icmp_); + + // Use max_data_files_size for both limits. When max_data_files_size > 0, + // it takes precedence and max_table_files_size is ignored, but keeping + // them consistent avoids contradictory configurations. + const uint64_t max_table_files_size = max_data_files_size; + + for (int round = 0; round < num_rounds; round++) { + auto [sst_size, blob_size] = gen(round); + s.files.insert(s.files.begin(), + {sst_size, blob_size, s.global_age++, false}); + s.wa.bytes_flushed += sst_size; + + // Pick compaction. Handle both dropping and intra-L0 results. + CompactionReason reason; + auto inputs = + PickCompactionFromFiles(picker, s.files, max_table_files_size, + max_data_files_size, trigger, &reason); + if (!inputs.empty()) { + if (reason == CompactionReason::kFIFOMaxSize || + reason == CompactionReason::kFIFOTtl) { + // Size/TTL dropping: remove the picked files + std::vector sorted = inputs; + std::sort(sorted.rbegin(), sorted.rend()); + for (size_t idx : sorted) { + s.files.erase(s.files.begin() + idx); + } + } else { + // Intra-L0 compaction: merge picked files + uint64_t compaction_input = 0; + for (size_t idx : inputs) { + compaction_input += s.files[idx].size; + } + s.wa.bytes_compacted += compaction_input; + ExecuteCompaction(s.files, inputs, s.global_age); + s.compaction_count++; + } + } + s.max_file_count_seen = std::max(s.max_file_count_seen, + static_cast(s.files.size())); + } + } + + // Assertion helpers + void AssertFileCountBounded(const std::vector& files, + uint64_t max_count, uint64_t multiplier = 3) { + ASSERT_LE(files.size(), max_count * multiplier) + << "File count " << files.size() << " exceeds " + << max_count * multiplier; + } + + void AssertCompactedUniform(const std::vector& files, double max_cv) { + auto stats = ComputeStats(files, true); + if (stats.count >= 2) { + ASSERT_LE(stats.cv, max_cv) + << "Compacted CV=" << stats.cv << " exceeds " << max_cv + << " (min=" << stats.min_size << " max=" << stats.max_size + << " mean=" << stats.mean_size << " count=" << stats.count << ")"; + } + } + + void AssertLowWriteAmp(const WriteAmpTracker& wa, double max_wa = 3.0) { + ASSERT_LE(wa.sst_write_amp(), max_wa) + << "Write amp=" << wa.sst_write_amp() << " exceeds " << max_wa; + } + + void AssertStandardGoals(const TestState& s, uint64_t max_count, + double max_cv = 0.30, double max_wa = 3.0, + uint64_t file_mult = 3) { + AssertFileCountBounded(s.files, max_count, file_mult); + AssertCompactedUniform(s.files, max_cv); + AssertLowWriteAmp(s.wa, max_wa); + } + + // Verify that graduated files (>= target) are never picked for compaction. + void AssertGraduatedNotPicked(const std::vector& files, int trigger, + uint64_t max_data_files_size) { + ioptions_.compaction_style = kCompactionStyleFIFO; + FIFOCompactionPicker picker(ioptions_, &icmp_); + const uint64_t max_table_files_size = max_data_files_size; + + CompactionReason reason; + auto inputs = + PickCompactionFromFiles(picker, files, max_table_files_size, + max_data_files_size, trigger, &reason); + if (!inputs.empty() && reason == CompactionReason::kFIFOReduceNumFiles) { + // Compute target from the picker's perspective: we need to estimate + // it the same way the picker does. + uint64_t total_sst = 0, total_blob = 0; + for (const auto& f : files) { + total_sst += f.size; + total_blob += f.blob_size; + } + double sst_ratio = total_blob > 0 ? static_cast(total_sst) / + (total_sst + total_blob) + : 1.0; + uint64_t target = + static_cast(max_data_files_size * sst_ratio) / trigger; + + for (size_t idx : inputs) { + ASSERT_LT(files[idx].size, target) + << "Should not re-compact graduated file at index " << idx + << " size=" << files[idx].size << " target=" << target; + } + } + } +}; + +// Variable flush + FIFO dropping -- the full scenario. +// Variable SST sizes (32-128KB), variable blob sizes (32-96MB), with +// FIFO size-based dropping active. This covers constant flush, variable +// flush, and FIFO dropping behaviors in a single test. +TEST_F(FIFORatioBasedCompactionPickingTest, VariableFlushWithFIFODropping) { + const uint64_t kCap = 500ULL * 1024 * 1024; + Random rng(42); + TestState s; + RunFlushAndCompact(s, 200, /*trigger=*/10, kCap, [&](int) { + return std::make_pair((32 + rng.Next() % 97) * 1024ULL, + (32 + rng.Next() % 65) * 1024ULL * 1024); + }); + AssertStandardGoals(s, 10, /*max_cv=*/0.40); +} + +// Verify graduated files are never re-compacted. +// With the tiered algorithm, intermediate compacted files CAN be merged +// at higher tier boundaries (that's the whole point of tiering). But files +// that have reached the target size ("graduated") should never be picked. +TEST_F(FIFORatioBasedCompactionPickingTest, NoCascadingReCompaction) { + const uint64_t kCap = 10ULL * 1024 * 1024 * 1024; + TestState s; + RunFlushAndCompact(s, 200, /*trigger=*/10, kCap, [](int) { + return std::make_pair(64ULL * 1024, 64ULL * 1024 * 1024); + }); + + AssertGraduatedNotPicked(s.files, 10, kCap); + // Write amp should be bounded (k=2 tiers for this config, so wa <= 3+margin) + AssertLowWriteAmp(s.wa, 4.0); +} + +// Early memtable flush -- very small flushes +TEST_F(FIFORatioBasedCompactionPickingTest, EarlyMemtableFlush) { + const uint64_t kCap = 1ULL * 1024 * 1024 * 1024; + Random rng(123); + TestState s; + RunFlushAndCompact(s, 100, /*trigger=*/10, kCap, [&](int) { + uint64_t sst = (rng.Next() % 5 == 0) ? (64 + rng.Next() % 65) * 1024ULL + : (8 + rng.Next() % 25) * 1024ULL; + return std::make_pair(sst, 32ULL * 1024 * 1024); + }); + + AssertStandardGoals(s, 10, /*max_cv=*/0.50, /*max_wa=*/4.0, + /*file_mult=*/5); +} + +// Blob compression variation -- data per flush varies, shifting +// the SST/blob ratio. The target is recomputed on every PickCompaction call +// (no caching), so the picker naturally adapts to ratio changes. +TEST_F(FIFORatioBasedCompactionPickingTest, BlobCompressionVariation) { + const uint64_t kCap = 300ULL * 1024 * 1024; + Random rng(456); + TestState s; + RunFlushAndCompact(s, 150, /*trigger=*/10, kCap, [&](int) { + return std::make_pair(64ULL * 1024, + (20 + rng.Next() % 61) * 1024ULL * 1024); + }); + AssertCompactedUniform(s.files, 0.30); +} + +// Large target/flush ratio -- verify logarithmic write amp with tiering +TEST_F(FIFORatioBasedCompactionPickingTest, TieredLargeRatio) { + // target/flush ~ 1000x with trigger=10 -> k=3 tiers, write amp ~ 4. + // Without tiering (flat merge), write amp would be ~57x. + const uint64_t kCap = 10ULL * 1024 * 1024 * 1024; // 10GB + TestState s; + // SST = 1KB, blob = 1MB. sst_ratio ~ 0.001. + // target = 10GB * 0.001 / 10 = 1MB. ratio = 1MB/1KB = 1024. + // k = ceil(log_10(1024)) = 4. Tier boundaries: ~10KB, ~100KB, 1MB. + // (10KB floor means lowest boundary is 10KB, not 1KB) + RunFlushAndCompact(s, 500, /*trigger=*/10, kCap, [](int) { + return std::make_pair(1ULL * 1024, 1ULL * 1024 * 1024); + }); + + // Write amp should be logarithmic: k+1 = 4 (with 10KB floor, 3 tiers). + // Allow some margin for ramp-up and boundary effects. + AssertLowWriteAmp(s.wa, 6.0); + + // File count should be bounded: trigger * (k+1) ~ 10 * 4 = 40 + AssertFileCountBounded(s.files, 10, /*multiplier=*/6); +} + +// Tiered progression -- verify intermediate tiers form and merge up +TEST_F(FIFORatioBasedCompactionPickingTest, TieredProgression) { + // SST = 10KB, blob = 1MB, cap = 100MB, trigger=4. + // sst_ratio ~ 10KB/1010KB ~ 0.0099. + // target = 100MB * 0.0099 / 4 ~ 248KB. ratio ~ 25. + // k = ceil(log_4(25)) = ceil(2.32) = 3. Boundaries: ~16KB, ~62KB, ~248KB. + const uint64_t kCap = 100ULL * 1024 * 1024; + TestState s; + RunFlushAndCompact(s, 200, /*trigger=*/4, kCap, [](int) { + return std::make_pair(10ULL * 1024, 1ULL * 1024 * 1024); + }); + + // Should have compacted files at multiple tier sizes + auto stats = ComputeStats(s.files, true); + ASSERT_GE(stats.count, 1u) << "Should have at least one compacted file"; + + // Write amp should be bounded: k+1 = 4, plus margin + AssertLowWriteAmp(s.wa, 5.0); +} + +// Graduated files should never be re-compacted +TEST_F(FIFORatioBasedCompactionPickingTest, GraduatedFilesNotRecompacted) { + // Build a state with graduated files (>= target), then verify they are + // never selected for compaction. + const uint64_t kCap = 500ULL * 1024 * 1024; // 500MB + TestState s; + // SST = 64KB, blob = 50MB. sst_ratio ~ 0.00125. + // target = 500MB * 0.00125 / 4 ~ 156KB. + // k = ceil(log_4(156/64)) = ceil(log_4(2.44)) = 1. + RunFlushAndCompact(s, 60, /*trigger=*/4, kCap, [](int) { + return std::make_pair(64ULL * 1024, 50ULL * 1024 * 1024); + }); + + AssertGraduatedNotPicked(s.files, 4, kCap); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc index 427abb9eabc7..173e317a1006 100644 --- a/db/compaction/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -38,7 +38,8 @@ class UniversalCompactionBuilder { const MutableDBOptions& mutable_db_options, const std::vector& existing_snapshots, const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage, - UniversalCompactionPicker* picker, LogBuffer* log_buffer) + UniversalCompactionPicker* picker, LogBuffer* log_buffer, + bool require_max_output_level, const std::string& full_history_ts_low) : ioptions_(ioptions), icmp_(icmp), cf_name_(cf_name), @@ -46,7 +47,11 @@ class UniversalCompactionBuilder { mutable_db_options_(mutable_db_options), vstorage_(vstorage), picker_(picker), - log_buffer_(log_buffer) { + log_buffer_(log_buffer), + require_max_output_level_(require_max_output_level), + allow_ingest_behind_(ioptions.cf_allow_ingest_behind || + ioptions.allow_ingest_behind), + full_history_ts_low_(full_history_ts_low) { assert(icmp_); const auto* ucmp = icmp_->user_comparator(); assert(ucmp); @@ -102,6 +107,174 @@ class UniversalCompactionBuilder { bool level_has_marked_standalone_rangedel; }; + unsigned int GetMaxNumFilesToCompactBasedOnMaxReadAmp( + const int file_num_compaction_trigger, const unsigned int ratio, + int* num_sr_not_compacted_output, int* max_num_runs_output) const { + assert(num_sr_not_compacted_output); + assert(max_num_runs_output); + int max_num_runs = + mutable_cf_options_.compaction_options_universal.max_read_amp; + if (max_num_runs < 0) { + // any value < -1 is not valid + assert(max_num_runs == -1); + // By default, fall back to `level0_file_num_compaction_trigger` + max_num_runs = file_num_compaction_trigger; + } else if (max_num_runs == 0) { + if (mutable_cf_options_.compaction_options_universal.stop_style == + kCompactionStopStyleTotalSize) { + // 0 means auto-tuning by RocksDB. We estimate max num run based on + // max_run_size, size_ratio and write buffer size: + // Assume the size of the lowest level size is equal to + // write_buffer_size. Each subsequent level is the max size without + // triggering size_ratio compaction. `max_num_runs` is the minimum + // number of levels required such that the target size of the + // largest level is at least `max_run_size_`. + max_num_runs = 1; + double cur_level_max_size = + static_cast(mutable_cf_options_.write_buffer_size); + double total_run_size = 0; + while (cur_level_max_size < static_cast(max_run_size_)) { + // This loop should not take too many iterations since + // cur_level_max_size at least doubles each iteration. + total_run_size += cur_level_max_size; + cur_level_max_size = (100.0 + ratio) / 100.0 * total_run_size; + ++max_num_runs; + } + } else { + // TODO: implement the auto-tune logic for this stop style + max_num_runs = file_num_compaction_trigger; + } + } else { + // max_num_runs > 0, it's the limit on the number of sorted run + } + + // Get the total number of sorted runs that are not being compacted + int num_sr_not_compacted = 0; + for (size_t i = 0; i < sorted_runs_.size(); i++) { + if (sorted_runs_[i].being_compacted == false && + !sorted_runs_[i].level_has_marked_standalone_rangedel) { + num_sr_not_compacted++; + } + } + + *num_sr_not_compacted_output = num_sr_not_compacted; + *max_num_runs_output = max_num_runs; + + if (num_sr_not_compacted > max_num_runs) { + return num_sr_not_compacted - max_num_runs + 1; + } else { + return 0; + } + } + + Compaction* MaybePickPeriodicCompaction(Compaction* const prev_picked_c) { + if (prev_picked_c != nullptr || + vstorage_->FilesMarkedForPeriodicCompaction().empty()) { + return prev_picked_c; + } + // Always need to do a full compaction for periodic compaction. + Compaction* c = PickPeriodicCompaction(); + TEST_SYNC_POINT_CALLBACK("PostPickPeriodicCompaction", c); + if (c != nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: picked for periodic compaction\n", + cf_name_.c_str()); + } + return c; + } + + Compaction* MaybePickSizeAmpCompaction(Compaction* const prev_picked_c, + int file_num_compaction_trigger) { + if (prev_picked_c != nullptr || + sorted_runs_.size() < + static_cast(file_num_compaction_trigger)) { + return prev_picked_c; + } + Compaction* c = PickCompactionToReduceSizeAmp(); + if (c != nullptr) { + TEST_SYNC_POINT("PickCompactionToReduceSizeAmpReturnNonnullptr"); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: picked for size amp compaction \n", + cf_name_.c_str()); + } + return c; + } + + Compaction* MaybePickCompactionToReduceSortedRunsBasedFileRatio( + Compaction* const prev_picked_c, int file_num_compaction_trigger, + unsigned int ratio) { + if (prev_picked_c != nullptr || + sorted_runs_.size() < + static_cast(file_num_compaction_trigger)) { + return prev_picked_c; + } + Compaction* c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX); + if (c != nullptr) { + TEST_SYNC_POINT("PickCompactionToReduceSortedRunsReturnNonnullptr"); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: picked for size ratio compaction to " + "reduce sorted run\n", + cf_name_.c_str()); + } + return c; + } + + Compaction* MaybePickCompactionToReduceSortedRuns( + Compaction* const prev_picked_c, int file_num_compaction_trigger, + unsigned int ratio) { + if (prev_picked_c != nullptr || + sorted_runs_.size() < + static_cast(file_num_compaction_trigger)) { + return prev_picked_c; + } + + int num_sr_not_compacted = 0; + int max_num_runs = 0; + const unsigned int max_num_files_to_compact = + GetMaxNumFilesToCompactBasedOnMaxReadAmp(file_num_compaction_trigger, + ratio, &num_sr_not_compacted, + &max_num_runs); + if (max_num_files_to_compact == 0) { + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] Universal: skipping compaction to reduce sorted run, num " + "sorted runs not " + "being compacted -- %u, max num runs allowed -- %d, max_run_size " + "-- %" PRIu64 "\n", + cf_name_.c_str(), num_sr_not_compacted, max_num_runs, max_run_size_); + return nullptr; + } + + Compaction* c = + PickCompactionToReduceSortedRuns(UINT_MAX, max_num_files_to_compact); + if (c != nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: picked for sorted run num compaction " + "to reduce sorted run, to " + "compact file num -- %u, max num runs allowed" + "-- %d, max_run_size -- %" PRIu64 "\n", + cf_name_.c_str(), max_num_files_to_compact, max_num_runs, + max_run_size_); + } + return c; + } + + Compaction* MaybePickDeleteTriggeredCompaction( + Compaction* const prev_picked_c) { + if (prev_picked_c != nullptr) { + return prev_picked_c; + } + Compaction* c = PickDeleteTriggeredCompaction(); + if (c != nullptr) { + TEST_SYNC_POINT("PickDeleteTriggeredCompactionReturnNonnullptr"); + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] Universal: picked for delete triggered compaction\n", + cf_name_.c_str()); + } + return c; + } + // Pick Universal compaction to limit read amplification Compaction* PickCompactionToReduceSortedRuns( unsigned int ratio, unsigned int max_number_of_files_to_compact); @@ -249,6 +422,12 @@ class UniversalCompactionBuilder { return num_l0_to_exclude; } + bool MeetsOutputLevelRequirements(int output_level) const { + return !require_max_output_level_ || + Compaction::OutputToNonZeroMaxOutputLevel( + output_level, vstorage_->MaxOutputLevel(allow_ingest_behind_)); + } + const ImmutableOptions& ioptions_; const InternalKeyComparator* icmp_; double score_; @@ -270,6 +449,9 @@ class UniversalCompactionBuilder { // marked for compaction. This is only populated when snapshot info is // populated. std::map file_marked_for_compaction_to_sorted_run_index_; + bool require_max_output_level_; + bool allow_ingest_behind_; + const std::string& full_history_ts_low_; std::vector CalculateSortedRuns( const VersionStorageInfo& vstorage, int last_level, @@ -288,7 +470,9 @@ class UniversalCompactionBuilder { // and the index of the file in that level struct InputFileInfo { - InputFileInfo() : f(nullptr), level(0), index(0) {} + InputFileInfo() : InputFileInfo(nullptr, 0, 0) {} + InputFileInfo(FileMetaData* file_meta, size_t l, size_t i) + : f(file_meta), level(l), index(i) {} FileMetaData* f; size_t level; @@ -321,22 +505,14 @@ SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) { SmallestKeyHeap smallest_key_priority_q = SmallestKeyHeap(SmallestKeyHeapComparator(ucmp)); - InputFileInfo input_file; - for (size_t l = 0; l < c->num_input_levels(); l++) { if (c->num_input_files(l) != 0) { if (l == 0 && c->start_level() == 0) { for (size_t i = 0; i < c->num_input_files(0); i++) { - input_file.f = c->input(0, i); - input_file.level = 0; - input_file.index = i; - smallest_key_priority_q.push(std::move(input_file)); + smallest_key_priority_q.emplace(c->input(0, i), 0, i); } } else { - input_file.f = c->input(l, 0); - input_file.level = l; - input_file.index = 0; - smallest_key_priority_q.push(std::move(input_file)); + smallest_key_priority_q.emplace(c->input(l, 0), l, 0); } } } @@ -374,7 +550,7 @@ bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) { auto comparator = icmp_->user_comparator(); int first_iter = 1; - InputFileInfo prev, curr, next; + InputFileInfo prev, curr; SmallestKeyHeap smallest_key_priority_q = create_level_heap(c, icmp_->user_comparator()); @@ -397,17 +573,10 @@ bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) { prev = curr; } - next.f = nullptr; - if (c->level(curr.level) != 0 && curr.index < c->num_input_files(curr.level) - 1) { - next.f = c->input(curr.level, curr.index + 1); - next.level = curr.level; - next.index = curr.index + 1; - } - - if (next.f) { - smallest_key_priority_q.push(std::move(next)); + smallest_key_priority_q.emplace(c->input(curr.level, curr.index + 1), + curr.level, curr.index + 1); } } return true; @@ -428,15 +597,20 @@ bool UniversalCompactionPicker::NeedsCompaction( return false; } +// TODO leverage full_history_ts_low in universal compaction picking. It could +// help reduce the same infinite compaction loop issue found in level +// compaction. Compaction* UniversalCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, const std::vector& existing_snapshots, const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage, - LogBuffer* log_buffer) { + LogBuffer* log_buffer, const std::string& full_history_ts_low, + bool require_max_output_level) { UniversalCompactionBuilder builder( ioptions_, icmp_, cf_name, mutable_cf_options, mutable_db_options, - existing_snapshots, snapshot_checker, vstorage, this, log_buffer); + existing_snapshots, snapshot_checker, vstorage, this, log_buffer, + require_max_output_level, full_history_ts_low); return builder.PickCompaction(); } @@ -567,13 +741,20 @@ bool UniversalCompactionBuilder::ShouldSkipMarkedFile( Compaction* UniversalCompactionBuilder::PickCompaction() { const int kLevel0 = 0; score_ = vstorage_->CompactionScore(kLevel0); - int max_output_level = - vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind); + const int max_output_level = vstorage_->MaxOutputLevel(allow_ingest_behind_); + const int file_num_compaction_trigger = + mutable_cf_options_.level0_file_num_compaction_trigger; + const unsigned int ratio = + mutable_cf_options_.compaction_options_universal.size_ratio; + + if (max_output_level == 0 && + !MeetsOutputLevelRequirements(0 /* output_level */)) { + return nullptr; + } + max_run_size_ = 0; sorted_runs_ = CalculateSortedRuns(*vstorage_, max_output_level, &max_run_size_); - int file_num_compaction_trigger = - mutable_cf_options_.level0_file_num_compaction_trigger; if (sorted_runs_.size() == 0 || (vstorage_->FilesMarkedForPeriodicCompaction().empty() && @@ -585,6 +766,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() { "UniversalCompactionBuilder::PickCompaction:Return", nullptr); return nullptr; } + VersionStorageInfo::LevelSummaryStorage tmp; ROCKS_LOG_BUFFER_MAX_SZ( log_buffer_, 3072, @@ -592,127 +774,22 @@ Compaction* UniversalCompactionBuilder::PickCompaction() { cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp)); Compaction* c = nullptr; - // Periodic compaction has higher priority than other type of compaction - // because it's a hard requirement. - if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) { - // Always need to do a full compaction for periodic compaction. - c = PickPeriodicCompaction(); - TEST_SYNC_POINT_CALLBACK("PostPickPeriodicCompaction", c); - } - - if (c == nullptr && - sorted_runs_.size() >= static_cast(file_num_compaction_trigger)) { - // Check for size amplification. - if ((c = PickCompactionToReduceSizeAmp()) != nullptr) { - TEST_SYNC_POINT("PickCompactionToReduceSizeAmpReturnNonnullptr"); - ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n", - cf_name_.c_str()); - } else { - // Size amplification is within limits. Try reducing read - // amplification while maintaining file size ratios. - unsigned int ratio = - mutable_cf_options_.compaction_options_universal.size_ratio; - - if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) { - TEST_SYNC_POINT("PickCompactionToReduceSortedRunsReturnNonnullptr"); - ROCKS_LOG_BUFFER(log_buffer_, - "[%s] Universal: compacting for size ratio\n", - cf_name_.c_str()); - } else { - // Size amplification and file size ratios are within configured limits. - // If max read amplification exceeds configured limits, then force - // compaction to reduce the number sorted runs without looking at file - // size ratios. - - // This is guaranteed by NeedsCompaction() - assert(sorted_runs_.size() >= - static_cast(file_num_compaction_trigger)); - int max_num_runs = - mutable_cf_options_.compaction_options_universal.max_read_amp; - if (max_num_runs < 0) { - // any value < -1 is not valid - assert(max_num_runs == -1); - // By default, fall back to `level0_file_num_compaction_trigger` - max_num_runs = file_num_compaction_trigger; - } else if (max_num_runs == 0) { - if (mutable_cf_options_.compaction_options_universal.stop_style == - kCompactionStopStyleTotalSize) { - // 0 means auto-tuning by RocksDB. We estimate max num run based on - // max_run_size, size_ratio and write buffer size: - // Assume the size of the lowest level size is equal to - // write_buffer_size. Each subsequent level is the max size without - // triggering size_ratio compaction. `max_num_runs` is the minimum - // number of levels required such that the target size of the - // largest level is at least `max_run_size_`. - max_num_runs = 1; - double cur_level_max_size = - static_cast(mutable_cf_options_.write_buffer_size); - double total_run_size = 0; - while (cur_level_max_size < static_cast(max_run_size_)) { - // This loop should not take too many iterations since - // cur_level_max_size at least doubles each iteration. - total_run_size += cur_level_max_size; - cur_level_max_size = (100.0 + ratio) / 100.0 * total_run_size; - ++max_num_runs; - } - } else { - // TODO: implement the auto-tune logic for this stop style - max_num_runs = file_num_compaction_trigger; - } - } else { - // max_num_runs > 0, it's the limit on the number of sorted run - } - // Get the total number of sorted runs that are not being compacted - int num_sr_not_compacted = 0; - for (size_t i = 0; i < sorted_runs_.size(); i++) { - if (sorted_runs_[i].being_compacted == false && - !sorted_runs_[i].level_has_marked_standalone_rangedel) { - num_sr_not_compacted++; - } - } - // The number of sorted runs that are not being compacted is greater - // than the maximum allowed number of sorted runs - if (num_sr_not_compacted > max_num_runs) { - unsigned int num_files = num_sr_not_compacted - max_num_runs + 1; - if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) != - nullptr) { - ROCKS_LOG_BUFFER(log_buffer_, - "[%s] Universal: compacting for file num, to " - "compact file num -- %u, max num runs allowed" - "-- %d, max_run_size -- %" PRIu64 "\n", - cf_name_.c_str(), num_files, max_num_runs, - max_run_size_); - } - } else { - ROCKS_LOG_BUFFER( - log_buffer_, - "[%s] Universal: skipping compaction for file num, num runs not " - "being compacted -- %u, max num runs allowed -- %d, max_run_size " - "-- %" PRIu64 "\n", - cf_name_.c_str(), num_sr_not_compacted, max_num_runs, - max_run_size_); - } - } - } - } - - if (c == nullptr) { - if ((c = PickDeleteTriggeredCompaction()) != nullptr) { - TEST_SYNC_POINT("PickDeleteTriggeredCompactionReturnNonnullptr"); - ROCKS_LOG_BUFFER(log_buffer_, - "[%s] Universal: delete triggered compaction\n", - cf_name_.c_str()); - } - } + c = MaybePickPeriodicCompaction(c); + c = MaybePickSizeAmpCompaction(c, file_num_compaction_trigger); + c = MaybePickCompactionToReduceSortedRunsBasedFileRatio( + c, file_num_compaction_trigger, ratio); + c = MaybePickCompactionToReduceSortedRuns(c, file_num_compaction_trigger, + ratio); + c = MaybePickDeleteTriggeredCompaction(c); if (c == nullptr) { TEST_SYNC_POINT_CALLBACK( "UniversalCompactionBuilder::PickCompaction:Return", nullptr); return nullptr; } - assert(c->output_level() <= - vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind)); + assert(c->output_level() <= vstorage_->MaxOutputLevel(allow_ingest_behind_)); + assert(MeetsOutputLevelRequirements(c->output_level())); if (mutable_cf_options_.compaction_options_universal.allow_trivial_move == true && @@ -754,7 +831,8 @@ Compaction* UniversalCompactionBuilder::PickCompaction() { RecordInHistogram(ioptions_.stats, NUM_FILES_IN_SINGLE_COMPACTION, num_files); picker_->RegisterCompaction(c); - vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_, + full_history_ts_low_); TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return", c); @@ -838,14 +916,16 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns( if (sr->being_compacted) { ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s" - "[%d] being compacted, skipping", + "[%d] being compacted, skipping for compaction to " + "reduce sorted runs", cf_name_.c_str(), file_num_buf, loop); } else if (sr->level_has_marked_standalone_rangedel) { - ROCKS_LOG_BUFFER(log_buffer_, - "[%s] Universal: %s" - "[%d] has standalone range tombstone files marked for " - "compaction, skipping", - cf_name_.c_str(), file_num_buf, loop); + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] Universal: %s" + "[%d] has standalone range tombstone files marked for " + "compaction, skipping for compaction to reduce sorted runs", + cf_name_.c_str(), file_num_buf, loop); } sr = nullptr; @@ -858,7 +938,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns( char file_num_buf[kFormatFileNumberBufSize]; sr->Dump(file_num_buf, sizeof(file_num_buf), true); ROCKS_LOG_BUFFER(log_buffer_, - "[%s] Universal: Possible candidate %s[%d].", + "[%s] Universal: Possible candidate for compaction to " + "reduce sorted runs %s[%d].", cf_name_.c_str(), file_num_buf, loop); } @@ -950,8 +1031,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns( int start_level = sorted_runs_[start_index].level; int output_level; // last level is reserved for the files ingested behind - int max_output_level = - vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind); + int max_output_level = vstorage_->MaxOutputLevel(allow_ingest_behind_); if (first_index_after == sorted_runs_.size()) { output_level = max_output_level; } else if (sorted_runs_[first_index_after].level == 0) { @@ -960,6 +1040,10 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns( output_level = sorted_runs_[first_index_after].level - 1; } + if (!MeetsOutputLevelRequirements(output_level)) { + return nullptr; + } + std::vector inputs(max_output_level + 1); for (size_t i = 0; i < inputs.size(); ++i) { inputs[i].level = start_level + static_cast(i); @@ -996,7 +1080,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns( if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction( inputs, output_level, - Compaction::EvaluatePenultimateLevel( + Compaction::EvaluateProximalLevel( vstorage_, mutable_cf_options_, ioptions_, start_level, output_level))) { return nullptr; @@ -1016,13 +1100,12 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns( output_level, 1, enable_compression), GetCompressionOptions(mutable_cf_options_, vstorage_, output_level, enable_compression), - mutable_cf_options_.default_write_temperature, + Temperature::kUnknown, /* max_subcompactions */ 0, grandparents, /* earliest_snapshot */ std::nullopt, - /* snapshot_checker */ nullptr, - /* is manual */ false, /* trim_ts */ "", score_, - false /* deletion_compaction */, - /* l0_files_might_overlap */ true, compaction_reason); + /* snapshot_checker */ nullptr, compaction_reason, + /* trim_ts */ "", score_, + /* l0_files_might_overlap */ true); } // Look at overall size amplification. If size amplification @@ -1052,18 +1135,19 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() { char file_num_buf[kFormatFileNumberBufSize]; sr->Dump(file_num_buf, sizeof(file_num_buf), true); if (sr->being_compacted) { - ROCKS_LOG_BUFFER( - log_buffer_, - "[%s] Universal: stopping at sorted run undergoing compaction: " - "%s[%" ROCKSDB_PRIszt "]", - cf_name_.c_str(), file_num_buf, start_index - 1); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: stopping for size amp compaction at " + "sorted run undergoing compaction: " + "%s[%" ROCKSDB_PRIszt "]", + cf_name_.c_str(), file_num_buf, start_index - 1); } else if (sr->level_has_marked_standalone_rangedel) { - ROCKS_LOG_BUFFER( - log_buffer_, - "[%s] Universal: stopping at sorted run that has standalone range " - "tombstone files marked for compaction: " - "%s[%" ROCKSDB_PRIszt "]", - cf_name_.c_str(), file_num_buf, start_index - 1); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: stopping for size amp compaction at " + "sorted run that has " + "standalone range " + "tombstone files marked for compaction: " + "%s[%" ROCKSDB_PRIszt "]", + cf_name_.c_str(), file_num_buf, start_index - 1); } break; } @@ -1079,11 +1163,12 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() { { const size_t num_l0_to_exclude = MightExcludeNewL0sToReduceWriteStop( num_l0_files, end_index, start_index, candidate_size); - ROCKS_LOG_BUFFER(log_buffer_, - "[%s] Universal: Excluding %" ROCKSDB_PRIszt - " latest L0 files to reduce potential write stop " - "triggered by `level0_stop_writes_trigger`", - cf_name_.c_str(), num_l0_to_exclude); + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] Universal: Excluding for size amp compaction %" ROCKSDB_PRIszt + " latest L0 files to reduce potential write stop " + "triggered by `level0_stop_writes_trigger`", + cf_name_.c_str(), num_l0_to_exclude); } { @@ -1101,18 +1186,18 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() { // size amplification = percentage of additional size if (candidate_size * 100 < ratio * base_sr_size) { - ROCKS_LOG_BUFFER( - log_buffer_, - "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64 - " earliest-file-size %" PRIu64, - cf_name_.c_str(), candidate_size, base_sr_size); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: size amp compction not needed. " + "newer-files-total-size %" PRIu64 + " earliest-file-size %" PRIu64, + cf_name_.c_str(), candidate_size, base_sr_size); return nullptr; } else { - ROCKS_LOG_BUFFER( - log_buffer_, - "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64 - " earliest-file-size %" PRIu64, - cf_name_.c_str(), candidate_size, base_sr_size); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: size amp compaction needed. " + "newer-files-total-size %" PRIu64 + " earliest-file-size %" PRIu64, + cf_name_.c_str(), candidate_size, base_sr_size); } // Since incremental compaction can't include more than second last // level, it can introduce penalty, compared to full compaction. We @@ -1345,7 +1430,7 @@ Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp( // intra L0 compactions outputs could have overlap if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction( inputs, output_level, - Compaction::EvaluatePenultimateLevel( + Compaction::EvaluateProximalLevel( vstorage_, mutable_cf_options_, ioptions_, start_level, output_level))) { return nullptr; @@ -1363,14 +1448,13 @@ Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp( true /* enable_compression */), GetCompressionOptions(mutable_cf_options_, vstorage_, output_level, true /* enable_compression */), - mutable_cf_options_.default_write_temperature, + Temperature::kUnknown, /* max_subcompactions */ 0, /* grandparents */ {}, /* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr, - /* is manual */ false, - /* trim_ts */ "", score_, false /* deletion_compaction */, - /* l0_files_might_overlap */ true, - CompactionReason::kUniversalSizeAmplification); + CompactionReason::kUniversalSizeAmplification, + /* trim_ts */ "", score_, + /* l0_files_might_overlap */ true); } // Pick files marked for compaction. Typically, files are marked by @@ -1439,8 +1523,7 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { return nullptr; } - int max_output_level = - vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind); + int max_output_level = vstorage_->MaxOutputLevel(allow_ingest_behind_); // Pick the first non-empty level after the start_level for (output_level = start_level + 1; output_level <= max_output_level; output_level++) { @@ -1463,10 +1546,23 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { } assert(output_level <= max_output_level); + if (!MeetsOutputLevelRequirements(output_level)) { + return nullptr; + } + if (output_level != 0) { + // For standalone range deletion, we don't want to compact it with newer + // L0 files that it doesn't cover. + const FileMetaData* starting_l0_file = + (start_level == 0 && start_level_inputs.size() == 1 && + start_level_inputs.files[0]->FileIsStandAloneRangeTombstone()) + ? start_level_inputs.files[0] + : nullptr; + if (start_level == 0) { if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs, - output_level, nullptr)) { + output_level, nullptr, + starting_l0_file)) { return nullptr; } } @@ -1477,7 +1573,8 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { output_level_inputs.level = output_level; if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs, &output_level_inputs, - &parent_index, -1)) { + &parent_index, -1, false, + starting_l0_file)) { return nullptr; } inputs.push_back(start_level_inputs); @@ -1486,9 +1583,9 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { } if (picker_->FilesRangeOverlapWithCompaction( inputs, output_level, - Compaction::EvaluatePenultimateLevel( - vstorage_, mutable_cf_options_, ioptions_, start_level, - output_level))) { + Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_, + ioptions_, start_level, + output_level))) { return nullptr; } @@ -1514,13 +1611,11 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { /* max_grandparent_overlap_bytes */ GetMaxOverlappingBytes(), path_id, GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1), GetCompressionOptions(mutable_cf_options_, vstorage_, output_level), - mutable_cf_options_.default_write_temperature, + Temperature::kUnknown, /* max_subcompactions */ 0, grandparents, earliest_snapshot_, - snapshot_checker_, - /* is manual */ false, - /* trim_ts */ "", score_, false /* deletion_compaction */, - /* l0_files_might_overlap */ true, - CompactionReason::kFilesMarkedForCompaction); + snapshot_checker_, CompactionReason::kFilesMarkedForCompaction, + /* trim_ts */ "", score_, + /* l0_files_might_overlap */ true); } Compaction* UniversalCompactionBuilder::PickCompactionToOldest( @@ -1541,8 +1636,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange( uint32_t path_id = GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); int start_level = sorted_runs_[start_index].level; - int max_output_level = - vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind); + int max_output_level = vstorage_->MaxOutputLevel(allow_ingest_behind_); std::vector inputs(max_output_level + 1); for (size_t i = 0; i < inputs.size(); ++i) { inputs[i].level = start_level + static_cast(i); @@ -1587,10 +1681,14 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange( output_level = sorted_runs_[end_index + 1].level - 1; } + if (!MeetsOutputLevelRequirements(output_level)) { + return nullptr; + } + // intra L0 compactions outputs could have overlap if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction( inputs, output_level, - Compaction::EvaluatePenultimateLevel( + Compaction::EvaluateProximalLevel( vstorage_, mutable_cf_options_, ioptions_, start_level, output_level))) { return nullptr; @@ -1609,13 +1707,12 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange( true /* enable_compression */), GetCompressionOptions(mutable_cf_options_, vstorage_, output_level, true /* enable_compression */), - mutable_cf_options_.default_write_temperature, + Temperature::kUnknown, /* max_subcompactions */ 0, /* grandparents */ {}, /* earliest_snapshot */ std::nullopt, - /* snapshot_checker */ nullptr, - /* is manual */ false, - /* trim_ts */ "", score_, false /* deletion_compaction */, - /* l0_files_might_overlap */ true, compaction_reason); + /* snapshot_checker */ nullptr, compaction_reason, + /* trim_ts */ "", score_, + /* l0_files_might_overlap */ true); } Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() { diff --git a/db/compaction/compaction_picker_universal.h b/db/compaction/compaction_picker_universal.h index 18c0f27afbf4..175c11c9f0c3 100644 --- a/db/compaction/compaction_picker_universal.h +++ b/db/compaction/compaction_picker_universal.h @@ -18,12 +18,16 @@ class UniversalCompactionPicker : public CompactionPicker { UniversalCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} + + // If `require_max_output_level` is true, only pick compaction + // with max output level or return nullptr if no such compaction exists. Compaction* PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, const std::vector& existing_snapshots, const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage, - LogBuffer* log_buffer) override; + LogBuffer* log_buffer, const std::string& full_history_ts_low, + bool require_max_output_level = false) override; int MaxOutputLevel() const override { return NumberLevels() - 1; } bool NeedsCompaction(const VersionStorageInfo* vstorage) const override; diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc index d571dbbc0c5e..cb88c53d8f8d 100644 --- a/db/compaction/compaction_service_job.cc +++ b/db/compaction/compaction_service_job.cc @@ -41,7 +41,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( } compaction_input.cf_name = compaction->column_family_data()->GetName(); - compaction_input.snapshots = existing_snapshots_; + compaction_input.snapshots = job_context_->snapshot_seqs; compaction_input.has_begin = sub_compact->start.has_value(); compaction_input.begin = compaction_input.has_begin ? sub_compact->start->ToString() : ""; @@ -74,15 +74,27 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( compaction->column_family_data()->GetName().c_str(), job_id_, compaction_input.output_level, input_files_oss.str().c_str()); CompactionServiceJobInfo info( - dbname_, db_id_, db_session_id_, GetCompactionId(sub_compact), + dbname_, db_id_, db_session_id_, + compaction->column_family_data()->GetID(), + compaction->column_family_data()->GetName(), GetCompactionId(sub_compact), thread_pri_, compaction->compaction_reason(), compaction->is_full_compaction(), compaction->is_manual_compaction(), - compaction->bottommost_level()); + compaction->bottommost_level(), compaction->start_level(), + compaction->output_level()); + CompactionServiceScheduleResponse response = db_options_.compaction_service->Schedule(info, compaction_input_binary); switch (response.status) { case CompactionServiceJobStatus::kSuccess: break; + case CompactionServiceJobStatus::kAborted: + sub_compact->status = + Status::Aborted("Scheduling a remote compaction job was aborted"); + ROCKS_LOG_WARN( + db_options_.info_log, + "[%s] [JOB %d] Remote compaction was aborted at Schedule()", + compaction->column_family_data()->GetName().c_str(), job_id_); + return response.status; case CompactionServiceJobStatus::kFailure: sub_compact->status = Status::Incomplete( "CompactionService failed to schedule a remote compaction job."); @@ -102,6 +114,17 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( break; } + std::string debug_str_before_wait = + compaction->input_version()->DebugString(/*hex=*/true); + + // TODO: Update CompactionService API to support abort and resume + // functionality. Currently, remote compaction jobs cannot be aborted via + // AbortAllCompactions() because the CompactionService interface lacks methods + // to signal abort to remote workers and to properly resume after an abort. + // The API needs to be extended with: + // - A method to signal abort to running remote compaction jobs + // - A method to resume/re-enable scheduling after an abort is lifted + ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Waiting for remote compaction...", compaction->column_family_data()->GetName().c_str(), job_id_); @@ -110,6 +133,17 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( db_options_.compaction_service->Wait(response.scheduled_job_id, &compaction_result_binary); + if (compaction_status != CompactionServiceJobStatus::kSuccess) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "[%s] [JOB %d] Wait() status is not kSuccess. " + "\nDebugString Before Wait():\n%s" + "\nDebugString After Wait():\n%s", + compaction->column_family_data()->GetName().c_str(), job_id_, + debug_str_before_wait.c_str(), + compaction->input_version()->DebugString(/*hex=*/true).c_str()); + } + if (compaction_status == CompactionServiceJobStatus::kUseLocal) { ROCKS_LOG_INFO( db_options_.info_log, @@ -118,6 +152,16 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( return compaction_status; } + if (compaction_status == CompactionServiceJobStatus::kAborted) { + sub_compact->status = + Status::Aborted("Waiting a remote compaction job was aborted"); + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Remote compaction was aborted during Wait()", + compaction->column_family_data()->GetName().c_str(), + job_id_); + return compaction_status; + } + CompactionServiceResult compaction_result; s = CompactionServiceResult::Read(compaction_result_binary, &compaction_result); @@ -185,18 +229,24 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( } FileMetaData meta; - uint64_t file_size; - // FIXME: file_size should be part of CompactionServiceOutputFile so that - // we don't get DB corruption if the full file size has not been propagated - // back to the caller through the file system (which could have metadata - // lag or caching bugs). - s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr); + uint64_t file_size = file.file_size; + + // TODO - Clean this up in the next release. + // For backward compatibility - in case the remote worker does not populate + // the file_size yet. If missing, continue to populate this from the file + // system. + if (file_size == 0) { + s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr); + } + if (!s.ok()) { sub_compact->status = s; db_options_.compaction_service->OnInstallation( response.scheduled_job_id, CompactionServiceJobStatus::kFailure); return CompactionServiceJobStatus::kFailure; } + assert(file_size > 0); + meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size, file.smallest_seqno, file.largest_seqno); meta.smallest.DecodeFrom(file.smallest_internal_key); @@ -208,19 +258,35 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( meta.file_checksum_func_name = file.file_checksum_func_name; meta.marked_for_compaction = file.marked_for_compaction; meta.unique_id = file.unique_id; - + meta.temperature = file.file_temperature; + meta.tail_size = + FileMetaData::CalculateTailSize(file_size, file.table_properties); auto cfd = compaction->column_family_data(); - sub_compact->Current().AddOutput(std::move(meta), - cfd->internal_comparator(), false, true, - file.paranoid_hash); - sub_compact->Current().UpdateTableProperties(file.table_properties); + CompactionOutputs* compaction_outputs = + sub_compact->Outputs(file.is_proximal_level_output); + assert(compaction_outputs); + compaction_outputs->AddOutput(std::move(meta), cfd->internal_comparator(), + false, true, file.paranoid_hash); + compaction_outputs->UpdateTableProperties(file.table_properties); + } + + // Set per-level stats + auto compaction_output_stats = + sub_compact->OutputStats(false /* is_proximal_level */); + assert(compaction_output_stats); + compaction_output_stats->Add( + compaction_result.internal_stats.output_level_stats); + if (compaction->SupportsPerKeyPlacement()) { + compaction_output_stats = + sub_compact->OutputStats(true /* is_proximal_level */); + assert(compaction_output_stats); + compaction_output_stats->Add( + compaction_result.internal_stats.proximal_level_stats); } + + // Set job stats sub_compact->compaction_job_stats = compaction_result.stats; - sub_compact->Current().SetNumOutputRecords( - compaction_result.stats.num_output_records); - sub_compact->Current().SetNumOutputFiles( - compaction_result.stats.num_output_files); - sub_compact->Current().AddBytesWritten(compaction_result.bytes_written); + RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read); RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES, compaction_result.bytes_written); @@ -240,48 +306,38 @@ void CompactionServiceCompactionJob::RecordCompactionIOStats() { CompactionJob::RecordCompactionIOStats(); } -void CompactionServiceCompactionJob::UpdateCompactionJobStats( - const InternalStats::CompactionStats& stats) const { - compaction_job_stats_->elapsed_micros = stats.micros; - - // output information only in remote compaction - compaction_job_stats_->total_output_bytes = stats.bytes_written; - compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob; - compaction_job_stats_->num_output_records = stats.num_output_records; - compaction_job_stats_->num_output_files = stats.num_output_files; - compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob; -} - CompactionServiceCompactionJob::CompactionServiceCompactionJob( int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, const MutableDBOptions& mutable_db_options, const FileOptions& file_options, VersionSet* versions, const std::atomic* shutting_down, LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats, InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, - std::vector existing_snapshots, - std::shared_ptr table_cache, EventLogger* event_logger, - const std::string& dbname, const std::shared_ptr& io_tracer, + JobContext* job_context, std::shared_ptr table_cache, + EventLogger* event_logger, const std::string& dbname, + const std::shared_ptr& io_tracer, const std::atomic& manual_compaction_canceled, const std::string& db_id, const std::string& db_session_id, std::string output_path, const CompactionServiceInput& compaction_service_input, CompactionServiceResult* compaction_service_result) - : CompactionJob(job_id, compaction, db_options, mutable_db_options, - file_options, versions, shutting_down, log_buffer, nullptr, - output_directory, nullptr, stats, db_mutex, - db_error_handler, std::move(existing_snapshots), - kMaxSequenceNumber, nullptr, nullptr, - std::move(table_cache), event_logger, - compaction->mutable_cf_options().paranoid_file_checks, - compaction->mutable_cf_options().report_bg_io_stats, dbname, - &(compaction_service_result->stats), Env::Priority::USER, - io_tracer, manual_compaction_canceled, db_id, db_session_id, - compaction->column_family_data()->GetFullHistoryTsLow()), + : CompactionJob( + job_id, compaction, db_options, mutable_db_options, file_options, + versions, shutting_down, log_buffer, nullptr, output_directory, + nullptr, stats, db_mutex, db_error_handler, job_context, + std::move(table_cache), event_logger, + compaction->mutable_cf_options().paranoid_file_checks, + compaction->mutable_cf_options().report_bg_io_stats, dbname, + &(compaction_service_result->stats), Env::Priority::USER, io_tracer, + manual_compaction_canceled, CompactionJob::kCompactionAbortedFalse, + db_id, db_session_id, + compaction->column_family_data()->GetFullHistoryTsLow()), output_path_(std::move(output_path)), compaction_input_(compaction_service_input), compaction_result_(compaction_service_result) {} -void CompactionServiceCompactionJob::Prepare() { +void CompactionServiceCompactionJob::Prepare( + const CompactionProgress& compaction_progress, + log::Writer* compaction_progress_writer) { std::optional begin; if (compaction_input_.has_begin) { begin = compaction_input_.begin; @@ -290,7 +346,8 @@ void CompactionServiceCompactionJob::Prepare() { if (compaction_input_.has_end) { end = compaction_input_.end; } - CompactionJob::Prepare(std::make_pair(begin, end)); + CompactionJob::Prepare(std::make_pair(begin, end), compaction_progress, + compaction_progress_writer); } Status CompactionServiceCompactionJob::Run() { @@ -313,15 +370,14 @@ Status CompactionServiceCompactionJob::Run() { ProcessKeyValueCompaction(sub_compact); - compaction_stats_.stats.micros = - db_options_.clock->NowMicros() - start_micros; - compaction_stats_.stats.cpu_micros = - sub_compact->compaction_job_stats.cpu_micros; + uint64_t elapsed_micros = db_options_.clock->NowMicros() - start_micros; + internal_stats_.SetMicros(elapsed_micros); + internal_stats_.AddCpuMicros(elapsed_micros); RecordTimeToHistogram(stats_, COMPACTION_TIME, - compaction_stats_.stats.micros); + internal_stats_.output_level_stats.micros); RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, - compaction_stats_.stats.cpu_micros); + internal_stats_.output_level_stats.cpu_micros); Status status = sub_compact->status; IOStatus io_s = sub_compact->io_status; @@ -351,38 +407,45 @@ Status CompactionServiceCompactionJob::Run() { // Build Compaction Job Stats - // 1. Aggregate CompactionOutputStats into Internal Compaction Stats - // (compaction_stats_) and aggregate Compaction Job Stats - // (compaction_job_stats_) from the sub compactions - compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_); - - // 2. Update the Output information in the Compaction Job Stats with - // aggregated Internal Compaction Stats. - UpdateCompactionJobStats(compaction_stats_.stats); - - // 3. Set fields that are not propagated as part of aggregations above + // 1. Aggregate internal stats and job stats for all subcompactions + // internal stats: sub_compact.proximal_level_outputs_.stats and + // sub_compact.compaction_outputs_.stats into + // internal_stats_.output_level_stats and + // internal_stats_.proximal_level_stats + // job-level stats: sub_compact.compaction_job_stats into compact.job_stats_ + // + // For remote compaction, there's only one subcompaction. + compact_->AggregateCompactionStats(internal_stats_, *job_stats_); + + // 2. Update job-level output stats with the aggregated internal_stats_ + // Please note that input stats will be updated by primary host when all + // subcompactions are finished + UpdateCompactionJobOutputStatsFromInternalStats(status, internal_stats_); + // and set fields that are not propagated as part of the update compaction_result_->stats.is_manual_compaction = c->is_manual_compaction(); compaction_result_->stats.is_full_compaction = c->is_full_compaction(); compaction_result_->stats.is_remote_compaction = true; - // 4. Update IO Stats that are not part of the aggregations above (bytes_read, - // bytes_written) + // 3. Update IO Stats that are not part of the the update above + // (bytes_read, bytes_written) RecordCompactionIOStats(); // Build Output + compaction_result_->internal_stats = internal_stats_; compaction_result_->output_level = compact_->compaction->output_level(); compaction_result_->output_path = output_path_; if (status.ok()) { for (const auto& output_file : sub_compact->GetOutputs()) { auto& meta = output_file.meta; compaction_result_->output_files.emplace_back( - MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno, - meta.fd.largest_seqno, meta.smallest.Encode().ToString(), - meta.largest.Encode().ToString(), meta.oldest_ancester_time, - meta.file_creation_time, meta.epoch_number, meta.file_checksum, - meta.file_checksum_func_name, output_file.validator.GetHash(), - meta.marked_for_compaction, meta.unique_id, - *output_file.table_properties); + MakeTableFileName(meta.fd.GetNumber()), meta.fd.GetFileSize(), + meta.fd.smallest_seqno, meta.fd.largest_seqno, + meta.smallest.Encode().ToString(), meta.largest.Encode().ToString(), + meta.oldest_ancester_time, meta.file_creation_time, meta.epoch_number, + meta.file_checksum, meta.file_checksum_func_name, + output_file.validator.GetHash(), meta.marked_for_compaction, + meta.unique_id, *output_file.table_properties, + output_file.is_proximal_level, meta.temperature); } } @@ -482,6 +545,10 @@ static std::unordered_map {offsetof(struct CompactionServiceOutputFile, file_name), OptionType::kEncodedString, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"file_size", + {offsetof(struct CompactionServiceOutputFile, file_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"smallest_seqno", {offsetof(struct CompactionServiceOutputFile, smallest_seqno), OptionType::kUInt64T, OptionVerificationType::kNormal, @@ -554,7 +621,16 @@ static std::unordered_map const auto this_one = static_cast(addr1); const auto that_one = static_cast(addr2); return this_one->AreEqual(opts, that_one, mismatch); - }}}}; + }}}, + {"is_proximal_level_output", + {offsetof(struct CompactionServiceOutputFile, + is_proximal_level_output), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_temperature", + {offsetof(struct CompactionServiceOutputFile, file_temperature), + OptionType::kTemperature, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}}; static std::unordered_map compaction_job_stats_type_info = { @@ -679,6 +755,125 @@ static std::unordered_map OptionTypeFlags::kNone}}, }; +static std::unordered_map + compaction_stats_type_info = { + {"micros", + {offsetof(struct InternalStats::CompactionStats, micros), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"cpu_micros", + {offsetof(struct InternalStats::CompactionStats, cpu_micros), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_read_non_output_levels", + {offsetof(struct InternalStats::CompactionStats, + bytes_read_non_output_levels), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_read_output_level", + {offsetof(struct InternalStats::CompactionStats, + bytes_read_output_level), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_skipped_non_output_levels", + {offsetof(struct InternalStats::CompactionStats, + bytes_skipped_non_output_levels), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_skipped_output_level", + {offsetof(struct InternalStats::CompactionStats, + bytes_skipped_output_level), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_read_blob", + {offsetof(struct InternalStats::CompactionStats, bytes_read_blob), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_written", + {offsetof(struct InternalStats::CompactionStats, bytes_written), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_written_blob", + {offsetof(struct InternalStats::CompactionStats, bytes_written_blob), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_moved", + {offsetof(struct InternalStats::CompactionStats, bytes_moved), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files_in_non_output_levels", + {offsetof(struct InternalStats::CompactionStats, + num_input_files_in_non_output_levels), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files_in_output_level", + {offsetof(struct InternalStats::CompactionStats, + num_input_files_in_output_level), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_filtered_input_files_in_non_output_levels", + {offsetof(struct InternalStats::CompactionStats, + num_filtered_input_files_in_non_output_levels), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_filtered_input_files_in_output_level", + {offsetof(struct InternalStats::CompactionStats, + num_filtered_input_files_in_output_level), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files", + {offsetof(struct InternalStats::CompactionStats, num_output_files), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files_blob", + {offsetof(struct InternalStats::CompactionStats, + num_output_files_blob), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_records", + {offsetof(struct InternalStats::CompactionStats, num_input_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_dropped_records", + {offsetof(struct InternalStats::CompactionStats, num_dropped_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_records", + {offsetof(struct InternalStats::CompactionStats, num_output_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"count", + {offsetof(struct InternalStats::CompactionStats, count), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"counts", OptionTypeInfo::Array< + int, static_cast(CompactionReason::kNumOfReasons)>( + offsetof(struct InternalStats::CompactionStats, counts), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kInt})}, +}; + +static std::unordered_map + compaction_internal_stats_type_info = { + {"output_level_stats", + OptionTypeInfo::Struct( + "output_level_stats", &compaction_stats_type_info, + offsetof(struct InternalStats::CompactionStatsFull, + output_level_stats), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, + {"has_proximal_level_output", + {offsetof(struct InternalStats::CompactionStatsFull, + has_proximal_level_output), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"proximal_level_stats", + OptionTypeInfo::Struct( + "proximal_level_stats", &compaction_stats_type_info, + offsetof(struct InternalStats::CompactionStatsFull, + proximal_level_stats), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, +}; + namespace { // this is a helper struct to serialize and deserialize class Status, because // Status's members are not public. @@ -785,6 +980,11 @@ static std::unordered_map cs_result_type_info = { "stats", &compaction_job_stats_type_info, offsetof(struct CompactionServiceResult, stats), OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, + {"internal_stats", + OptionTypeInfo::Struct( + "internal_stats", &compaction_internal_stats_type_info, + offsetof(struct CompactionServiceResult, internal_stats), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, }; Status CompactionServiceInput::Read(const std::string& data_str, diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc index 694466ce0c70..f76a25092974 100644 --- a/db/compaction/compaction_service_test.cc +++ b/db/compaction/compaction_service_test.cc @@ -4,9 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). #include "db/db_test_util.h" +#include "file/file_util.h" #include "port/stack_trace.h" #include "rocksdb/utilities/options_util.h" #include "table/unique_id_impl.h" +#include "utilities/merge_operators/string_append/stringappend.h" namespace ROCKSDB_NAMESPACE { @@ -15,17 +17,17 @@ class MyTestCompactionService : public CompactionService { MyTestCompactionService( std::string db_path, Options& options, std::shared_ptr& statistics, - std::vector>& listeners, + std::vector> listeners, std::vector> table_properties_collector_factories) : db_path_(std::move(db_path)), - options_(options), statistics_(statistics), - start_info_("na", "na", "na", 0, Env::TOTAL, CompactionReason::kUnknown, - false, false, false), - wait_info_("na", "na", "na", 0, Env::TOTAL, CompactionReason::kUnknown, - false, false, false), - listeners_(listeners), + options_(options), + start_info_("na", "na", "na", 0, "na", 0, Env::TOTAL, + CompactionReason::kUnknown, false, false, false, -1, -1), + wait_info_("na", "na", "na", 0, "na", 0, Env::TOTAL, + CompactionReason::kUnknown, false, false, false, -1, -1), + listeners_(std::move(listeners)), table_properties_collector_factories_( std::move(table_properties_collector_factories)) {} @@ -71,6 +73,31 @@ class MyTestCompactionService : public CompactionService { if (is_override_wait_status_) { return override_wait_status_; } + + CompactionServiceOptionsOverride options_override = GetOptionsOverride(); + + OpenAndCompactOptions options; + options.canceled = &canceled_; + + Status s = + DB::OpenAndCompact(options, db_path_, GetOutputPath(scheduled_job_id), + compaction_input, result, options_override); + { + InstrumentedMutexLock l(&mutex_); + if (is_override_wait_result_) { + *result = override_wait_result_; + } + result_ = *result; + } + compaction_num_.fetch_add(1); + if (s.ok()) { + return CompactionServiceJobStatus::kSuccess; + } else { + return CompactionServiceJobStatus::kFailure; + } + } + + CompactionServiceOptionsOverride GetOptionsOverride() { CompactionServiceOptionsOverride options_override; options_override.env = options_.env; options_override.file_checksum_gen_factory = @@ -84,6 +111,7 @@ class MyTestCompactionService : public CompactionService { options_override.table_factory = options_.table_factory; options_override.sst_partitioner_factory = options_.sst_partitioner_factory; options_override.statistics = statistics_; + options_override.info_log = options_.info_log; if (!listeners_.empty()) { options_override.listeners = listeners_; } @@ -92,26 +120,7 @@ class MyTestCompactionService : public CompactionService { options_override.table_properties_collector_factories = table_properties_collector_factories_; } - - OpenAndCompactOptions options; - options.canceled = &canceled_; - - Status s = - DB::OpenAndCompact(options, db_path_, db_path_ + "/" + scheduled_job_id, - compaction_input, result, options_override); - { - InstrumentedMutexLock l(&mutex_); - if (is_override_wait_result_) { - *result = override_wait_result_; - } - result_ = *result; - } - compaction_num_.fetch_add(1); - if (s.ok()) { - return CompactionServiceJobStatus::kSuccess; - } else { - return CompactionServiceJobStatus::kFailure; - } + return options_override; } void CancelAwaitingJobs() override { canceled_ = true; } @@ -158,14 +167,21 @@ class MyTestCompactionService : public CompactionService { return final_updated_status_.load(); } - private: + protected: InstrumentedMutex mutex_; - std::atomic_int compaction_num_{0}; + const std::string db_path_; + std::shared_ptr statistics_; std::map jobs_; std::map infos_; - const std::string db_path_; + std::string result_; + + std::string GetOutputPath(const std::string& scheduled_job_id) { + return db_path_ + "/" + scheduled_job_id; + } + + private: + std::atomic_int compaction_num_{0}; Options options_; - std::shared_ptr statistics_; CompactionServiceJobInfo start_info_; CompactionServiceJobInfo wait_info_; bool is_override_start_status_ = false; @@ -175,14 +191,15 @@ class MyTestCompactionService : public CompactionService { CompactionServiceJobStatus override_wait_status_ = CompactionServiceJobStatus::kFailure; bool is_override_wait_result_ = false; - std::string result_; std::string override_wait_result_; std::vector> listeners_; std::vector> table_properties_collector_factories_; - std::atomic_bool canceled_{false}; std::atomic final_updated_status_{ CompactionServiceJobStatus::kUseLocal}; + + protected: + std::atomic_bool canceled_{false}; }; class CompactionServiceTest : public DBTestBase { @@ -277,8 +294,17 @@ TEST_F(CompactionServiceTest, BasicCompactions) { Statistics* primary_statistics = GetPrimaryStatistics(); Statistics* compactor_statistics = GetCompactorStatistics(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::PrefetchTail::TaiSizeNotRecorded", + [&](void* /* arg */) { + // Trigger assertion to verify precise tail prefetch size calculation + assert(false); + }); + + SyncPoint::GetInstance()->EnableProcessing(); GenerateTestData(); ASSERT_OK(dbfull()->TEST_WaitForCompact()); + SyncPoint::GetInstance()->DisableProcessing(); VerifyTestData(); auto my_cs = GetCompactionService(); @@ -357,11 +383,12 @@ TEST_F(CompactionServiceTest, BasicCompactions) { } else { ASSERT_OK(result.status); } - ASSERT_GE(result.stats.elapsed_micros, 1); - ASSERT_GE(result.stats.cpu_micros, 1); + ASSERT_GE(result.internal_stats.output_level_stats.micros, 1); + ASSERT_GE(result.internal_stats.output_level_stats.cpu_micros, 1); - ASSERT_EQ(20, result.stats.num_output_records); - ASSERT_EQ(result.output_files.size(), result.stats.num_output_files); + ASSERT_EQ(20, result.internal_stats.output_level_stats.num_output_records); + ASSERT_EQ(result.output_files.size(), + result.internal_stats.output_level_stats.num_output_files); uint64_t total_size = 0; for (auto output_file : result.output_files) { @@ -372,13 +399,14 @@ TEST_F(CompactionServiceTest, BasicCompactions) { ASSERT_GT(file_size, 0); total_size += file_size; } - ASSERT_EQ(total_size, result.stats.total_output_bytes); + ASSERT_EQ(total_size, result.internal_stats.TotalBytesWritten()); ASSERT_TRUE(result.stats.is_remote_compaction); ASSERT_TRUE(result.stats.is_manual_compaction); ASSERT_FALSE(result.stats.is_full_compaction); Close(); + SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(CompactionServiceTest, ManualCompaction) { @@ -422,6 +450,133 @@ TEST_F(CompactionServiceTest, ManualCompaction) { ASSERT_OK(result.status); ASSERT_TRUE(result.stats.is_manual_compaction); ASSERT_TRUE(result.stats.is_remote_compaction); + + auto info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(0, info.cf_id); + ASSERT_EQ(kDefaultColumnFamilyName, info.cf_name); + + info = my_cs->GetCompactionInfoForWait(); + ASSERT_EQ(0, info.cf_id); + ASSERT_EQ(kDefaultColumnFamilyName, info.cf_name); + + // Test non-default CF + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); + my_cs->GetResult(&result); + ASSERT_OK(result.status); + ASSERT_TRUE(result.stats.is_manual_compaction); + ASSERT_TRUE(result.stats.is_remote_compaction); + + info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(handles_[1]->GetID(), info.cf_id); + ASSERT_EQ(handles_[1]->GetName(), info.cf_name); + + info = my_cs->GetCompactionInfoForWait(); + ASSERT_EQ(handles_[1]->GetID(), info.cf_id); + ASSERT_EQ(handles_[1]->GetName(), info.cf_name); +} + +TEST_F(CompactionServiceTest, StandaloneDeleteRangeTombstoneOptimization) { + Options options = CurrentOptions(); + + size_t num_files_after_filtered = 0; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::MakeInputIterator:NewCompactionMergingIterator", + [&](void* arg) { + num_files_after_filtered = *static_cast(arg); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + for (auto compaction_style : {CompactionStyle::kCompactionStyleLevel, + CompactionStyle::kCompactionStyleUniversal}) { + SCOPED_TRACE("Style: " + std::to_string(compaction_style)); + options.compaction_style = compaction_style; + ReopenWithCompactionService(&options); + + num_files_after_filtered = 0; + + std::vector files; + { + // Writes first version of data in range partitioned files. + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file1 = dbname_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + ASSERT_OK(sst_file_writer.Put("a", "a1")); + ASSERT_OK(sst_file_writer.Put("b", "b1")); + ExternalSstFileInfo file1_info; + ASSERT_OK(sst_file_writer.Finish(&file1_info)); + files.push_back(std::move(file1)); + + std::string file2 = dbname_ + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + ASSERT_OK(sst_file_writer.Put("x", "x1")); + ASSERT_OK(sst_file_writer.Put("y", "y1")); + ExternalSstFileInfo file2_info; + ASSERT_OK(sst_file_writer.Finish(&file2_info)); + files.push_back(std::move(file2)); + } + + IngestExternalFileOptions ifo; + ASSERT_OK(db_->IngestExternalFile(files, ifo)); + ASSERT_EQ(Get("a"), "a1"); + ASSERT_EQ(Get("b"), "b1"); + ASSERT_EQ(Get("x"), "x1"); + ASSERT_EQ(Get("y"), "y1"); + ASSERT_EQ(2, NumTableFilesAtLevel(6)); + + auto my_cs = GetCompactionService(); + uint64_t comp_num = my_cs->GetCompactionNum(); + + { + // Atomically delete old version of data with one range delete file. + // And a new batch of range partitioned files with new version of data. + files.clear(); + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file2 = dbname_ + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + ASSERT_OK(sst_file_writer.DeleteRange("a", "z")); + ExternalSstFileInfo file2_info; + ASSERT_OK(sst_file_writer.Finish(&file2_info)); + files.push_back(std::move(file2)); + + std::string file3 = dbname_ + "file3.sst"; + ASSERT_OK(sst_file_writer.Open(file3)); + ASSERT_OK(sst_file_writer.Put("a", "a2")); + ASSERT_OK(sst_file_writer.Put("b", "b2")); + ExternalSstFileInfo file3_info; + ASSERT_OK(sst_file_writer.Finish(&file3_info)); + files.push_back(std::move(file3)); + + std::string file4 = dbname_ + "file4.sst"; + ASSERT_OK(sst_file_writer.Open(file4)); + ASSERT_OK(sst_file_writer.Put("x", "x2")); + ASSERT_OK(sst_file_writer.Put("y", "y2")); + ExternalSstFileInfo file4_info; + ASSERT_OK(sst_file_writer.Finish(&file4_info)); + files.push_back(std::move(file4)); + } + + ASSERT_OK(db_->IngestExternalFile(files, ifo)); + ASSERT_OK(db_->WaitForCompact(WaitForCompactOptions())); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + + CompactionServiceResult result; + my_cs->GetResult(&result); + ASSERT_OK(result.status); + ASSERT_TRUE(result.stats.is_manual_compaction); + ASSERT_TRUE(result.stats.is_remote_compaction); + + if (compaction_style == kCompactionStyleUniversal) { + ASSERT_EQ(num_files_after_filtered, 1); + } else { + // Not filtered + ASSERT_EQ(num_files_after_filtered, 3); + } + + Close(); + } + + SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(CompactionServiceTest, CompactionOutputFileIOError) { @@ -716,6 +871,119 @@ TEST_F(CompactionServiceTest, VerifyStatsLocalFallback) { VerifyTestData(); } +TEST_F(CompactionServiceTest, VerifyInputRecordCount) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + GenerateTestData(); + + auto my_cs = GetCompactionService(); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + + // Only iterator through 10 keys and force compaction to finish. + int num_iter = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::ProcessKeyValueCompaction()::stop", [&](void* stop_ptr) { + num_iter++; + if (num_iter == 10) { + *(bool*)stop_ptr = true; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // CompactRange() should fail + Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsCorruption()); + const char* expected_message = + "Compaction number of input keys does not match number of keys " + "processed."; + ASSERT_TRUE(std::strstr(s.getState(), expected_message)); + + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(CompactionServiceTest, EmptyResult) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + GenerateTestData(); + + auto my_cs = GetCompactionService(); + + uint64_t comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + + // Delete range to cover entire range + ASSERT_OK(db_->DeleteRange(WriteOptions(), "key", "keyz")); + ASSERT_OK(Flush()); + + // In this unit test, both remote compaction and primary db instance are + // running in the same process, so NewFileNumber will never have a collision. + // In the real-world remote compactions, when the compaction is indeed running + // in another process, this is not going to be the case. + // To simulate the SST file with the same name created in the tmp directory, + // override the file number in remote compaction to re-use old SST file + // number. + bool need_to_override_file_number = false; + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::OpenAndCompact::BeforeLoadingOptions:0", + [&](void*) { need_to_override_file_number = true; }); + + SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile::NewFileNumber", + [&](void* file_number) { + if (need_to_override_file_number) { + auto n = static_cast(file_number); + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(&cf_meta); + for (const auto& level : cf_meta.levels) { + for (const auto& file : level.files) { + // Use one of the existing file name + *n = test::GetFileNumber(file.name); + need_to_override_file_number = false; + return; + } + } + } + }); + + // Inject failure, so that the remote compaction fails after + // ProcessKeyValueCompaction() + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CompactWithoutInstallation::End", [&](void* status) { + // override job status + auto s = static_cast(status); + *s = Status::Aborted("MyTestCompactionService failed to compact!"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Compaction should fail and SST files in the primary db should exist + { + ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + for (const auto& level : meta.levels) { + for (const auto& file : level.files) { + std::string fname = file.db_path + "/" + file.name; + ASSERT_OK(db_->GetEnv()->FileExists(fname)); + } + } + } + Close(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + TEST_F(CompactionServiceTest, CorruptedOutput) { Options options = CurrentOptions(); options.disable_auto_compactions = true; @@ -781,6 +1049,7 @@ TEST_F(CompactionServiceTest, CorruptedOutputParanoidFileCheck) { Destroy(options); options.disable_auto_compactions = true; options.paranoid_file_checks = paranoid_file_check_enabled; + options.verify_output_flags = VerifyOutputFlags::kVerifyNone; ReopenWithCompactionService(&options); GenerateTestData(); @@ -835,6 +1104,87 @@ TEST_F(CompactionServiceTest, CorruptedOutputParanoidFileCheck) { } } +TEST_F(CompactionServiceTest, CorruptedOutputVerifyOutputFlags) { + for (VerifyOutputFlags verify_output_flags : + {VerifyOutputFlags::kVerifyNone, + VerifyOutputFlags::kEnableForLocalCompaction | + VerifyOutputFlags::kVerifyBlockChecksum, + VerifyOutputFlags::kEnableForRemoteCompaction | + VerifyOutputFlags::kVerifyBlockChecksum, + VerifyOutputFlags::kEnableForRemoteCompaction | + VerifyOutputFlags::kVerifyIteration, + VerifyOutputFlags::kVerifyAll}) { + SCOPED_TRACE( + "verify_output_flags=" + + std::to_string(static_cast>( + verify_output_flags))); + + Options options = CurrentOptions(); + Destroy(options); + options.disable_auto_compactions = true; + options.paranoid_file_checks = false; + options.verify_output_flags = verify_output_flags; + ReopenWithCompactionService(&options); + GenerateTestData(); + + auto my_cs = GetCompactionService(); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionServiceCompactionJob::Run:0", [&](void* arg) { + CompactionServiceResult* compaction_result = + *(static_cast(arg)); + ASSERT_TRUE(compaction_result != nullptr && + !compaction_result->output_files.empty()); + // Corrupt files here + for (const auto& output_file : compaction_result->output_files) { + std::string file_name = + compaction_result->output_path + "/" + output_file.file_name; + + // Corrupt very small range of bytes. This corruption is so small + // that this isn't caught by default light-weight check + ASSERT_OK(test::CorruptFile(env_, file_name, 0, 1, + false /* verifyChecksum */)); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + const bool is_enabled_for_remote_compaction = + !!(verify_output_flags & VerifyOutputFlags::kEnableForRemoteCompaction); + const bool should_verify_block_checksum = + !!(verify_output_flags & VerifyOutputFlags::kVerifyBlockChecksum); + const bool should_verify_iteration = + !!(verify_output_flags & VerifyOutputFlags::kVerifyIteration); + + Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); + if (is_enabled_for_remote_compaction && + (should_verify_block_checksum || should_verify_iteration)) { + ASSERT_NOK(s); + ASSERT_TRUE(s.IsCorruption()); + } else { + // CompactRange() goes through if block checksum wasn't verified + ASSERT_OK(s); + } + + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // On the worker side, the compaction is considered success + // Verification is done on the primary side + CompactionServiceResult result; + my_cs->GetResult(&result); + ASSERT_OK(result.status); + ASSERT_TRUE(result.stats.is_manual_compaction); + ASSERT_TRUE(result.stats.is_remote_compaction); + } +} + TEST_F(CompactionServiceTest, TruncatedOutput) { Options options = CurrentOptions(); options.disable_auto_compactions = true; @@ -849,6 +1199,12 @@ TEST_F(CompactionServiceTest, TruncatedOutput) { Slice end(end_str); uint64_t comp_num = my_cs->GetCompactionNum(); + // Skip calculating tail size to avoid crashing due to truncated file size + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "FileMetaData::CalculateTailSize", [&](void* arg) { + bool* skip = static_cast(arg); + *skip = true; + }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionServiceCompactionJob::Run:0", [&](void* arg) { CompactionServiceResult* compaction_result = @@ -865,7 +1221,7 @@ TEST_F(CompactionServiceTest, TruncatedOutput) { ASSERT_OK(s); ASSERT_GT(file_size, 0); - ASSERT_OK(test::TruncateFile(env_, file_name, file_size / 2)); + ASSERT_OK(test::TruncateFile(env_, file_name, file_size / 4)); } }); SyncPoint::GetInstance()->EnableProcessing(); @@ -1024,8 +1380,9 @@ TEST_F(CompactionServiceTest, CancelCompactionOnPrimarySide) { // Primary DB calls CancelAllBackgroundWork() while the compaction is running SyncPoint::GetInstance()->SetCallBack( - "CompactionJob::Run():Inprogress", - [&](void* /*arg*/) { CancelAllBackgroundWork(db_, false /*wait*/); }); + "CompactionJob::Run():Inprogress", [&](void* /*arg*/) { + CancelAllBackgroundWork(db_.get(), false /*wait*/); + }); SyncPoint::GetInstance()->EnableProcessing(); @@ -1140,22 +1497,48 @@ TEST_F(CompactionServiceTest, CompactionFilter) { ASSERT_GE(my_cs->GetCompactionNum(), 1); } -TEST_F(CompactionServiceTest, Snapshot) { +TEST_F(CompactionServiceTest, MergeOperator) { Options options = CurrentOptions(); + options.merge_operator.reset(new StringAppendOperator(',')); ReopenWithCompactionService(&options); - - ASSERT_OK(Put(Key(1), "value1")); - ASSERT_OK(Put(Key(2), "value1")); - const Snapshot* s1 = db_->GetSnapshot(); - ASSERT_OK(Flush()); - - ASSERT_OK(Put(Key(1), "value2")); - ASSERT_OK(Put(Key(3), "value2")); - ASSERT_OK(Flush()); - + GenerateTestData(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + for (int i = 0; i < 200; i++) { + ASSERT_OK(db_->Merge(WriteOptions(), Key(i), + "merge_op_append_" + std::to_string(i))); + } ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - auto my_cs = GetCompactionService(); - ASSERT_GE(my_cs->GetCompactionNum(), 1); + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + std::to_string(i) + ",merge_op_append_" + + std::to_string(i)); + } else { + ASSERT_EQ(result, "value_new" + std::to_string(i) + ",merge_op_append_" + + std::to_string(i)); + } + } + auto my_cs = GetCompactionService(); + ASSERT_GE(my_cs->GetCompactionNum(), 1); +} + +TEST_F(CompactionServiceTest, Snapshot) { + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(1), "value2")); + ASSERT_OK(Put(Key(3), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + auto my_cs = GetCompactionService(); + ASSERT_GE(my_cs->GetCompactionNum(), 1); ASSERT_EQ("value1", Get(Key(1), s1)); ASSERT_EQ("value2", Get(Key(1))); db_->ReleaseSnapshot(s1); @@ -1188,34 +1571,52 @@ TEST_F(CompactionServiceTest, PrecludeLastLevel) { for (int i = 0; i < kNumTrigger; i++) { for (int j = 0; j < kNumKeys; j++) { - // FIXME: need to assign outputs to levels to allow overlapping ranges: - // ASSERT_OK(Put(Key(j * kNumTrigger + i), "v" + std::to_string(i))); - // instead of this (too easy): - ASSERT_OK(Put(Key(i * kNumKeys + j), "v" + std::to_string(i))); + ASSERT_OK(Put(Key(j * kNumTrigger + i), "v" + std::to_string(i))); } ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); - // Data split between penultimate (kUnknown) and last (kCold) levels - // FIXME: need to assign outputs to levels to get this: - // ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); - // ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); - // ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); - // instead of this (WRONG but currently expected): - ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); - // Check manifest temperatures + // Data split between proximal (kUnknown) and last (kCold) levels + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); - ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + // TODO: Check FileSystem temperatures with FileTemperatureTestFS for (int i = 0; i < kNumTrigger; i++) { for (int j = 0; j < kNumKeys; j++) { - // FIXME - // ASSERT_EQ(Get(Key(j * kNumTrigger + i)), "v" + std::to_string(i)); - ASSERT_EQ(Get(Key(i * kNumKeys + j)), "v" + std::to_string(i)); + ASSERT_EQ(Get(Key(j * kNumTrigger + i)), "v" + std::to_string(i)); } } + + // Verify Output Stats + auto my_cs = GetCompactionService(); + { + CompactionServiceResult result; + my_cs->GetResult(&result); + ASSERT_OK(result.status); + ASSERT_GT(result.internal_stats.output_level_stats.cpu_micros, 0); + ASSERT_GT(result.internal_stats.output_level_stats.micros, 0); + ASSERT_EQ(result.internal_stats.output_level_stats.num_output_records + + result.internal_stats.proximal_level_stats.num_output_records, + kNumTrigger * kNumKeys); + ASSERT_EQ(result.internal_stats.output_level_stats.num_output_files + + result.internal_stats.proximal_level_stats.num_output_files, + 2); + + CompactionServiceJobInfo info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(0, info.base_input_level); + ASSERT_EQ(kNumLevels - 1, info.output_level); + } + SyncPoint::GetInstance()->DisableProcessing(); + // Disable Preclude feature and run full compaction to the bottommost level + { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + CompactionServiceJobInfo info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(kNumLevels - 2, info.base_input_level); + ASSERT_EQ(kNumLevels - 1, info.output_level); + } } TEST_F(CompactionServiceTest, ConcurrentCompaction) { @@ -1285,12 +1686,17 @@ TEST_F(CompactionServiceTest, CompactionInfo) { ASSERT_EQ(true, info.is_manual_compaction); ASSERT_EQ(false, info.is_full_compaction); ASSERT_EQ(true, info.bottommost_level); + ASSERT_EQ(1, info.base_input_level); + ASSERT_EQ(2, info.output_level); info = my_cs->GetCompactionInfoForWait(); ASSERT_EQ(Env::USER, info.priority); ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason); ASSERT_EQ(true, info.is_manual_compaction); ASSERT_EQ(false, info.is_full_compaction); ASSERT_EQ(true, info.bottommost_level); + ASSERT_EQ(1, info.base_input_level); + ASSERT_EQ(2, info.output_level); + ASSERT_EQ(kDefaultColumnFamilyName, info.cf_name); // Test priority BOTTOM env_->SetBackgroundThreads(1, Env::BOTTOM); @@ -1322,18 +1728,24 @@ TEST_F(CompactionServiceTest, CompactionInfo) { ASSERT_EQ(false, info.is_full_compaction); ASSERT_EQ(true, info.bottommost_level); ASSERT_EQ(Env::BOTTOM, info.priority); + ASSERT_EQ(0, info.base_input_level); + ASSERT_EQ(db_->NumberLevels() - 1, info.output_level); info = my_cs->GetCompactionInfoForWait(); ASSERT_EQ(Env::BOTTOM, info.priority); ASSERT_EQ(CompactionReason::kLevelL0FilesNum, info.compaction_reason); ASSERT_EQ(false, info.is_manual_compaction); ASSERT_EQ(false, info.is_full_compaction); ASSERT_EQ(true, info.bottommost_level); + ASSERT_EQ(0, info.base_input_level); + ASSERT_EQ(db_->NumberLevels() - 1, info.output_level); // Test Non-Bottommost Level options.num_levels = 4; ReopenWithCompactionService(&options); my_cs = static_cast_with_check(GetCompactionService()); + int compaction_num = my_cs->GetCompactionNum(); + ASSERT_EQ(0, compaction_num); for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) { for (int j = 0; j < 10; j++) { @@ -1342,16 +1754,22 @@ TEST_F(CompactionServiceTest, CompactionInfo) { } ASSERT_OK(Flush()); } - ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // This is trivial move. Done locally. + ASSERT_EQ(0, my_cs->GetCompactionNum()); info = my_cs->GetCompactionInfoForStart(); ASSERT_EQ(false, info.is_manual_compaction); ASSERT_EQ(false, info.is_full_compaction); ASSERT_EQ(false, info.bottommost_level); + ASSERT_EQ(-1, info.base_input_level); + ASSERT_EQ(-1, info.output_level); info = my_cs->GetCompactionInfoForWait(); ASSERT_EQ(false, info.is_manual_compaction); ASSERT_EQ(false, info.is_full_compaction); ASSERT_EQ(false, info.bottommost_level); + ASSERT_EQ(-1, info.base_input_level); + ASSERT_EQ(-1, info.output_level); // Test Full Compaction + Bottommost Level options.num_levels = 6; @@ -1366,7 +1784,10 @@ TEST_F(CompactionServiceTest, CompactionInfo) { } ASSERT_OK(Flush()); } + MoveFilesToLevel(options.num_levels - 1); + // Force final level compaction + // base_input_level == output_level == last_level CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); @@ -1378,10 +1799,15 @@ TEST_F(CompactionServiceTest, CompactionInfo) { ASSERT_EQ(true, info.bottommost_level); ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason); info = my_cs->GetCompactionInfoForWait(); + ASSERT_EQ(options.num_levels - 1, info.base_input_level); + ASSERT_EQ(options.num_levels - 1, info.output_level); ASSERT_EQ(true, info.is_manual_compaction); ASSERT_EQ(true, info.is_full_compaction); ASSERT_EQ(true, info.bottommost_level); ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason); + ASSERT_EQ(options.num_levels - 1, info.base_input_level); + ASSERT_EQ(options.num_levels - 1, info.output_level); + ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); } TEST_F(CompactionServiceTest, FallbackLocalAuto) { @@ -1471,6 +1897,40 @@ TEST_F(CompactionServiceTest, FallbackLocalManual) { VerifyTestData(); } +TEST_F(CompactionServiceTest, AbortedWhileWait) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + + GenerateTestData(); + VerifyTestData(); + + auto my_cs = GetCompactionService(); + Statistics* compactor_statistics = GetCompactorStatistics(); + Statistics* primary_statistics = GetPrimaryStatistics(); + + my_cs->ResetOverride(); + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + + // Override Wait() result with kAborted + my_cs->OverrideWaitStatus(CompactionServiceJobStatus::kAborted); + start_str = Key(120); + start = start_str; + + Status s = db_->CompactRange(CompactRangeOptions(), &start, nullptr); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsAborted()); + // no remote compaction is run + ASSERT_EQ(my_cs->GetCompactionNum(), 0); + // make sure the compaction statistics is not recorded any side + ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), 0); + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), 0); + ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 0); +} + TEST_F(CompactionServiceTest, RemoteEventListener) { class RemoteEventListenerTest : public EventListener { public: @@ -1643,6 +2103,761 @@ TEST_F(CompactionServiceTest, TablePropertiesCollector) { ASSERT_TRUE(has_user_property); } +class ResumableCompactionService : public MyTestCompactionService { + public: + enum class TestScenario { + // Test scenario 1: Two-phase compaction with resumption + // - Phase 1: Cancel the compaction running with resumption enabled (saves + // progress) + // - Phase 2: Resume from saved progress and complete + // Validates: Resumption reduces redundant work + kCancelThenResume, + + // Test scenario 2: Two-phase compaction without resumption + // - Phase 1: Cancel the compaction running with resumption enabled (saves + // progress) + // - Phase 2: Start fresh without resumption (ignores saved progress) and + // complete + // Validates: Disabling resumption causes full reprocessing + kCancelThenFreshStart, + + // Test scenario 3: Three-phase compaction toggling resumption on/off/on + // - Phase 1: Cancel the compaction running with resumption enabled (saves + // progress) + // - Phase 2: Start fresh wtihout resumption (ignores saved progress) and + // cancel agains + // - Phase 3: Resume with resumption support (loads Phase 1's progress) and + // complete + // Validates: Resumption state can be toggled; + kMultipleCancelToggleResumption + }; + + ResumableCompactionService(const std::string& db_path, Options& options, + std::shared_ptr statistics, + TestScenario scenario) + : MyTestCompactionService(db_path, options, statistics, + {} /* listeners */, + {} /* table_properties_collector_factories */), + scenario_(scenario) {} + + // Set the user key where cancellation should happen. + void SetCancelAtKey(const std::string& key, SequenceNumber seqno) { + cancel_at_key_ = key; + cancel_at_seqno_ = seqno; + } + + CompactionServiceJobStatus Wait(const std::string& scheduled_job_id, + std::string* result) override { + std::string compaction_input = ExtractCompactionInput(scheduled_job_id); + EXPECT_FALSE(compaction_input.empty()); + + OpenAndCompactOptions open_and_compaction_options; + auto override_options = GetOptionsOverride(); + + // Force creation of one key per output file for test simplicity. + // ASSUMPTION: This makes stats.count directly proportional to keys + // processed. + SyncPoint::GetInstance()->SetCallBack( + "CompactionOutputs::ShouldStopBefore::manual_decision", + [this](void* p) { + auto* pair = static_cast*>(p); + *(pair->first) = true; // Force file cut at every key + + // If cancel_at_key_ is set, cancel when we encounter that key + if (!cancel_at_key_.empty() && !already_canceled_) { + ParsedInternalKey parsed_key; + if (ParseInternalKey(pair->second, &parsed_key, true).ok()) { + if (parsed_key.user_key.ToString() == cancel_at_key_) { + // Check sequence number if specified + if (cancel_at_seqno_ == kMaxSequenceNumber || + parsed_key.sequence == cancel_at_seqno_) { + canceled_ = true; + already_canceled_ = true; + } + } + } + } + }); + + // If no cancel_at_key_ is set, use the original behavior: + // Simulate cancelled compaction by overriding status at completion. So + // compaction processes all keys before this point to make stats.count + // comparison straightforward. + if (cancel_at_key_.empty()) { + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CompactWithoutInstallation::End", + [&](void* status) { + auto s = static_cast(status); + *s = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + }); + } + SyncPoint::GetInstance()->EnableProcessing(); + + // Phase 1: Run compaction with resumption enabled and cancel it + // - Processes input keys until cancellation point + // - Creates output files and saves progress + // - Status overridden to "paused" + open_and_compaction_options.allow_resumption = true; + open_and_compaction_options.canceled = &canceled_; + already_canceled_ = false; + canceled_ = false; + + auto phase1_stats = + RunCancelledCompaction(open_and_compaction_options, scheduled_job_id, + compaction_input, override_options); + + HistogramData phase2_stats; + + if (scenario_ == TestScenario::kMultipleCancelToggleResumption) { + // Phase 2: Run compaction WITHOUT resumption (fresh start) and cancel it + // - Delete all files left behind Phase 1 before calling OpenAndCompact() + // - Processes all input keys again from scratch + // - Creates output files but does NOT save progress + // - Status overridden to "paused" + open_and_compaction_options.allow_resumption = false; + + // Clean up output folder for fresh start + std::string output_dir = GetOutputPath(scheduled_job_id); + Status cleanup_status = DestroyDir(override_options.env, output_dir); + EXPECT_TRUE(cleanup_status.ok()); + EXPECT_OK(override_options.env->CreateDir(output_dir)); + + already_canceled_ = false; + canceled_ = false; + + phase2_stats = + RunCancelledCompaction(open_and_compaction_options, scheduled_job_id, + compaction_input, override_options); + + // Validation: Phase 2 starts from scratch, so it processes the same + // input keys as Phase 1. + // ASSUMPTION: With fixed input (10 keys) and deterministic cancellation + // (after processing), both phases create the same number of output files. + EXPECT_EQ(phase2_stats.count, phase1_stats.count); + } + + // Final phase: Run compaction to completion (no cancellation) + if (scenario_ == TestScenario::kMultipleCancelToggleResumption) { + // Attempt to resume but it ends up starting fresh + open_and_compaction_options.allow_resumption = true; + } else if (scenario_ == TestScenario::kCancelThenResume) { + // Resume from Phase 1's saved progress + open_and_compaction_options.allow_resumption = true; + } else { // kCancelThenFreshStart + // Start fresh without resumption + open_and_compaction_options.allow_resumption = false; + + // Clean up output folder for fresh start + std::string output_dir = GetOutputPath(scheduled_job_id); + Status cleanup_status = DestroyDir(override_options.env, output_dir); + EXPECT_TRUE(cleanup_status.ok()); + EXPECT_OK(override_options.env->CreateDir(output_dir)); + } + + // Prevent triggering of cancellation + SyncPoint::GetInstance()->ClearCallBack( + "DBImplSecondary::CompactWithoutInstallation::End"); + already_canceled_ = true; + canceled_ = false; + + auto final_phase_stats = + RunCompaction(open_and_compaction_options, scheduled_job_id, + compaction_input, override_options, result); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Validate statistics based on scenario (only when cancelling at end) + if (cancel_at_key_.empty()) { + if (scenario_ == TestScenario::kMultipleCancelToggleResumption) { + // ASSUMPTION: Phase 1 processes all keys before cancellation + EXPECT_GT(phase1_stats.count, 0); + + // ASSUMPTION: Phase 2 runs with allow_resumption=false and an empty + // folder. Phase 2 then creates its own output files (but doesn't save + // progress). When Phase 3 starts with allow_resumption=true, it finds + // no progress file exists, so it cannot resume and must start from + // scratch, processing all input keys again. Result: Phase 3 does the + // same amount of work as Phase 1. + EXPECT_EQ(final_phase_stats.count, phase1_stats.count); + + } else if (scenario_ == TestScenario::kCancelThenResume) { + // ASSUMPTION: Phase 1 processes all keys before cancellation + EXPECT_GT(phase1_stats.count, 0); + + // ASSUMPTION: Phase 1 processes all keys and saves progress before + // cancellation. Final phase resumes from Phase 1's saved progress. + // Since Phase 1 completed all processing before being cancelled, the + // final phase should do less work than Phase 1. + EXPECT_LT(final_phase_stats.count, phase1_stats.count); + + } else { // kCancelThenFreshStart + // ASSUMPTION: Phase 1 processes all keys before cancellation + EXPECT_GT(phase1_stats.count, 0); + + // ASSUMPTION: Final phase starts fresh without resumption, so it + // processes all input keys again and creates the same number of files + EXPECT_EQ(final_phase_stats.count, phase1_stats.count); + } + } + + StoreResult(*result); + + return CompactionServiceJobStatus::kSuccess; + } + + private: + std::string ExtractCompactionInput(const std::string& scheduled_job_id) { + InstrumentedMutexLock l(&mutex_); + + auto job_index = jobs_.find(scheduled_job_id); + if (job_index == jobs_.end()) { + return ""; + } + std::string compaction_input = std::move(job_index->second); + jobs_.erase(job_index); + + auto info_index = infos_.find(scheduled_job_id); + if (info_index == infos_.end()) { + return ""; + } + infos_.erase(info_index); + + return compaction_input; + } + + HistogramData RunCancelledCompaction( + const OpenAndCompactOptions& options, const std::string& scheduled_job_id, + const std::string& compaction_input, + const CompactionServiceOptionsOverride& override_options) { + std::string temp_result; + EXPECT_OK(statistics_->Reset()); + + Status s = + DB::OpenAndCompact(options, db_path_, GetOutputPath(scheduled_job_id), + compaction_input, &temp_result, override_options); + + EXPECT_TRUE(s.IsManualCompactionPaused()); + + HistogramData stats; + statistics_->histogramData(FILE_WRITE_COMPACTION_MICROS, &stats); + return stats; + } + + HistogramData RunCompaction( + const OpenAndCompactOptions& options, const std::string& scheduled_job_id, + const std::string& compaction_input, + const CompactionServiceOptionsOverride& override_options, + std::string* result) { + EXPECT_OK(statistics_->Reset()); + + Status s = + DB::OpenAndCompact(options, db_path_, GetOutputPath(scheduled_job_id), + compaction_input, result, override_options); + + EXPECT_TRUE(s.ok()); + + HistogramData stats; + statistics_->histogramData(FILE_WRITE_COMPACTION_MICROS, &stats); + return stats; + } + + void StoreResult(const std::string& result) { + InstrumentedMutexLock l(&mutex_); + result_ = result; + } + + TestScenario scenario_; + std::string cancel_at_key_; + SequenceNumber cancel_at_seqno_ = kMaxSequenceNumber; + std::atomic already_canceled_{false}; +}; + +class ResumableCompactionServiceTest : public CompactionServiceTest { + public: + explicit ResumableCompactionServiceTest() : CompactionServiceTest() {} + + void RunCompactionCancelTest( + ResumableCompactionService::TestScenario scenario) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + std::shared_ptr statistics = CreateDBStatistics(); + + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + BlockBasedTableOptions table_options; + table_options.verify_compression = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + auto resume_cs = std::make_shared( + dbname_, options, statistics, scenario); + options.compaction_service = resume_cs; + + DestroyAndReopen(options); + + GenerateTestData(); + + ASSERT_OK(statistics->Reset()); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + Status s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(s); + + VerifyTestData(); + + s = db_->VerifyChecksum(); + ASSERT_OK(s); + + s = db_->VerifyFileChecksums(ReadOptions()); + ASSERT_OK(s); + + CompactionServiceResult result; + resume_cs->GetResult(&result); + ASSERT_OK(result.status); + ASSERT_TRUE(result.stats.is_manual_compaction); + ASSERT_TRUE(result.stats.is_remote_compaction); + ASSERT_GT(result.output_files.size(), 0); + + uint64_t resumed_bytes = + statistics->getTickerCount(REMOTE_COMPACT_RESUMED_BYTES); + if (scenario == + ResumableCompactionService::TestScenario::kCancelThenResume) { + // When resuming compaction, some bytes should be resumed from previous + // progress + ASSERT_GT(resumed_bytes, 0); + } else if (scenario == ResumableCompactionService::TestScenario:: + kCancelThenFreshStart) { + // When starting fresh (ignoring existing progress), no bytes should be + // resumed + ASSERT_EQ(resumed_bytes, 0); + } else { // kMultipleCancelToggleResumption + // Phase 2 ran without resumption (fresh start), so Phase 3 has no + // progress to resume from. It processes all keys again from scratch. + ASSERT_EQ(resumed_bytes, 0); + } + } + + void GenerateTestData() { + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Put(Key(i), "value")); + ASSERT_OK(Flush()); + if (i % 2 == 0) { + ASSERT_OK(Delete(Key(i))); + ASSERT_OK(Flush()); + } + } + } + + void VerifyTestData() { + for (int i = 0; i < kNumKeys; ++i) { + if (i % 2 == 0) { + ASSERT_EQ("NOT_FOUND", Get((Key(i)))); + } else { + ASSERT_EQ("value", Get((Key(i)))); + } + } + } + + private: + static constexpr int kNumKeys = 10; +}; + +TEST_F(ResumableCompactionServiceTest, CompactionCancelThenResume) { + RunCompactionCancelTest( + ResumableCompactionService::TestScenario::kCancelThenResume); +} + +TEST_F(ResumableCompactionServiceTest, CompactionCancelThenFreshStart) { + RunCompactionCancelTest( + ResumableCompactionService::TestScenario::kCancelThenFreshStart); +} + +TEST_F(ResumableCompactionServiceTest, + CompactionMultipleCancelToggleResumption) { + RunCompactionCancelTest(ResumableCompactionService::TestScenario:: + kMultipleCancelToggleResumption); +} + +class ResumableCompactionKeyTypeTest : public CompactionServiceTest { + public: + explicit ResumableCompactionKeyTypeTest() : CompactionServiceTest() {} + + protected: + void SetupResumableCompactionService( + Options& options, const std::string& cancel_at_key = "", + SequenceNumber cancel_at_seqno = kMaxSequenceNumber) { + options.disable_auto_compactions = true; + statistics_ = CreateDBStatistics(); + + resume_cs_ = std::make_shared( + dbname_, options, statistics_, + ResumableCompactionService::TestScenario::kCancelThenResume); + + if (!cancel_at_key.empty()) { + resume_cs_->SetCancelAtKey(cancel_at_key, cancel_at_seqno); + } + + options.compaction_service = resume_cs_; + DestroyAndReopen(options); + } + + void ResetStatistics() { ASSERT_OK(statistics_->Reset()); } + + void VerifyResumeBytes() { + uint64_t resumed_bytes = + statistics_->getTickerCount(REMOTE_COMPACT_RESUMED_BYTES); + ASSERT_GT(resumed_bytes, 0); + } + + private: + std::shared_ptr resume_cs_; + std::shared_ptr statistics_; +}; + +// Cancel compaction right before processing key "c" to test resumption at a +// deletion at the non-bottom level. When resumed, compaction will continue +// from this deletion. +TEST_F(ResumableCompactionKeyTypeTest, + CancelAndResumeWithDeleteAtNonBottomLevel) { + Options options = CurrentOptions(); + + SetupResumableCompactionService(options, "c"); + + ASSERT_OK(Put("c", "old_value")); + ASSERT_OK(Put("c_placeholder", "placeholder")); + ASSERT_OK(Flush()); + MoveFilesToLevel(options.num_levels - 1); + + ASSERT_OK(Put("a", "val1")); + ASSERT_OK(Put("b", "val2")); + ASSERT_OK(Put("d", "val4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Delete("c")); + ASSERT_OK(Flush()); + + std::vector input_files; + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(&cf_meta); + + for (const auto& file : cf_meta.levels[0].files) { + input_files.push_back(file.name); + } + + ASSERT_EQ(input_files.size(), 2); + + ResetStatistics(); + + CompactionOptions compact_options; + ASSERT_OK( + db_->CompactFiles(compact_options, input_files, 1 /* output_level*/)); + + ASSERT_EQ(Get("a"), "val1"); + ASSERT_EQ(Get("b"), "val2"); + ASSERT_EQ(Get("c"), "NOT_FOUND"); + ASSERT_EQ(Get("d"), "val4"); + + VerifyResumeBytes(); +} + +// Cancel compaction right before processing key "c" to test resumption at a +// deletion at the ottom level. When resumed, compaction will continue from +// the last saved progress point before the delete. +TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithDeleteAtBottomLevel) { + Options options = CurrentOptions(); + + SetupResumableCompactionService(options, "c"); + + ASSERT_OK(Put("c", "old_value")); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(Delete("c")); + ASSERT_OK(Flush()); + MoveFilesToLevel(options.num_levels - 1); + + ASSERT_OK(Put("a", "val1")); + ASSERT_OK(Put("b", "val2")); + ASSERT_OK(Put("d", "val4")); + ASSERT_OK(Flush()); + + ResetStatistics(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_EQ(Get("a"), "val1"); + ASSERT_EQ(Get("b"), "val2"); + ASSERT_EQ(Get("c"), "NOT_FOUND"); + ASSERT_EQ(Get("c", snapshot), "old_value"); + ASSERT_EQ(Get("d"), "val4"); + db_->ReleaseSnapshot(snapshot); + + VerifyResumeBytes(); +} + +// Cancel compaction right before processing key "c" to test resumption at a +// merge operand. When resumed, compaction will continue from the last saved +// progress point before the merge operand. +TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithMerge) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + + SetupResumableCompactionService(options, "c"); + + ASSERT_OK(Put("c", "old_value")); + ASSERT_OK(Put("c_placeholder", "placeholder")); + ASSERT_OK(Flush()); + MoveFilesToLevel(options.num_levels - 1); + + ASSERT_OK(Put("a", "val1")); + ASSERT_OK(Put("b", "val2")); + ASSERT_OK(Put("d", "val4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Merge("c", "new_value")); + ASSERT_OK(Flush()); + + std::vector input_files; + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(&cf_meta); + + for (const auto& file : cf_meta.levels[0].files) { + input_files.push_back(file.name); + } + + ASSERT_EQ(input_files.size(), 2); + + ResetStatistics(); + + CompactionOptions compact_options; + ASSERT_OK( + db_->CompactFiles(compact_options, input_files, 1 /* output_level*/)); + + ASSERT_EQ(Get("a"), "val1"); + ASSERT_EQ(Get("b"), "val2"); + ASSERT_EQ(Get("c"), "old_value,new_value"); + ASSERT_EQ(Get("d"), "val4"); + + VerifyResumeBytes(); +} + +// Cancel compaction right before processing key "c" to test resumption at a +// single delete. When resumed, compaction will continue from the last saved +// progress point before the single delete. +TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithSingleDelete) { + Options options = CurrentOptions(); + + SetupResumableCompactionService(options, "c"); + + ASSERT_OK(Put("c", "old_value")); + ASSERT_OK(Put("c_placeholder", "placeholder")); + ASSERT_OK(Flush()); + MoveFilesToLevel(options.num_levels - 1); + + ASSERT_OK(Put("a", "val1")); + ASSERT_OK(Put("b", "val2")); + ASSERT_OK(Put("d", "val4")); + ASSERT_OK(Flush()); + + ASSERT_OK(SingleDelete("c")); + ASSERT_OK(Flush()); + + std::vector input_files; + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(&cf_meta); + + for (const auto& file : cf_meta.levels[0].files) { + input_files.push_back(file.name); + } + + ASSERT_EQ(input_files.size(), 2); + + ResetStatistics(); + + CompactionOptions compact_options; + ASSERT_OK( + db_->CompactFiles(compact_options, input_files, 1 /* output_level*/)); + + ASSERT_EQ(Get("a"), "val1"); + ASSERT_EQ(Get("b"), "val2"); + ASSERT_EQ(Get("c"), "NOT_FOUND"); + ASSERT_EQ(Get("d"), "val4"); + + VerifyResumeBytes(); +} + +// Cancel compaction right before processing key "c" to test resumption at a +// range delete. When resumed, compaction will continue from the last saved +// progress point before the range delete. +TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithRangeDelete) { + Options options = CurrentOptions(); + + SetupResumableCompactionService(options, "c"); + + ASSERT_OK(Put("c", "old_value")); + ASSERT_OK(Put("c_placeholder", "placeholder")); + ASSERT_OK(Flush()); + MoveFilesToLevel(options.num_levels - 1); + + ASSERT_OK(Put("a", "val1")); + ASSERT_OK(Put("b", "val2")); + ASSERT_OK(Put("d", "val4")); + ASSERT_OK(Flush()); + + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "c", "c_")); + ASSERT_OK(Flush()); + + std::vector input_files; + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(&cf_meta); + + for (const auto& file : cf_meta.levels[0].files) { + input_files.push_back(file.name); + } + + ASSERT_EQ(input_files.size(), 2); + + ResetStatistics(); + + CompactionOptions compact_options; + ASSERT_OK( + db_->CompactFiles(compact_options, input_files, 1 /* output_level*/)); + + ASSERT_EQ(Get("a"), "val1"); + ASSERT_EQ(Get("b"), "val2"); + ASSERT_EQ(Get("c"), "NOT_FOUND"); + ASSERT_EQ(Get("d"), "val4"); + + VerifyResumeBytes(); +} + +// Test resumption when a key has multiple versions spanning across file +// boundaries (i.e., the same key exists in multiple SST files). +// +// Scenario: +// File 1 largest key: key "b" +// File 2 smallest key: key "c" with seqno=4 (older version) +// File 3 largest key: key "c" with seqno=5 (newer version) +// +// Cancel compaction right before processing the older version of key "c". +// Upon resumption, compaction continues from the saved progress point "b" and +// correctly processes both versions +TEST_F(ResumableCompactionKeyTypeTest, + CancelAndResumeWithKeySpanningFileBoundaries) { + Options options = CurrentOptions(); + + // Set up cancellation at the older version of the key which will have + // sequence number zero-ed out + SetupResumableCompactionService(options, "c" /*cancel_at_key*/, 0 /*seqno*/); + + ASSERT_OK(Put("a", "val1")); + ASSERT_OK(Put("b", "val2")); + ASSERT_OK(Put("d", "val4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("c", "old_value")); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(Put("c", "new_value")); + ASSERT_OK(Flush()); + + ResetStatistics(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_EQ(Get("a"), "val1"); + ASSERT_EQ(Get("b"), "val2"); + ASSERT_EQ(Get("c"), "new_value"); + ASSERT_EQ(Get("c", snapshot), "old_value"); + ASSERT_EQ(Get("d"), "val4"); + db_->ReleaseSnapshot(snapshot); + + VerifyResumeBytes(); +} + +// Cancel compaction right before processing key "c" to test resumption at a +// wide column. When resumed, compaction will continue +// from the wide column. +TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithWideColumn) { + Options options = CurrentOptions(); + + SetupResumableCompactionService(options, "c" /*cancel_at_key*/); + + ASSERT_OK(Put("a", "val1")); + ASSERT_OK(Put("b", "val2")); + ASSERT_OK(Put("d", "val4")); + ASSERT_OK(Flush()); + + WideColumns columns{{"col1", "value1"}, {"col2", "value2"}}; + ASSERT_OK( + db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), "c", columns)); + ASSERT_OK(Flush()); + + ResetStatistics(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_EQ(Get("a"), "val1"); + ASSERT_EQ(Get("b"), "val2"); + + PinnableWideColumns result; + ASSERT_OK( + db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), "c", &result)); + WideColumns expected{{"col1", "value1"}, {"col2", "value2"}}; + ASSERT_EQ(result.columns(), expected); + + ASSERT_EQ(Get("d"), "val4"); + + VerifyResumeBytes(); +} + +// Cancel compaction right before processing key "c" to test resumption at a +// timed put. When resumed, compaction will continue +// from the timed put. +TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithTimedPut) { + Options options = CurrentOptions(); + options.preclude_last_level_data_seconds = 86400; // Enable TimedPut feature + options.preserve_internal_time_seconds = 86400; // Preserve write time + + SetupResumableCompactionService(options, "c" /*cancel_at_key*/); + + ASSERT_OK(Put("c", "old_value")); + ASSERT_OK(Put("c_placeholder", "placeholder")); + ASSERT_OK(Flush()); + MoveFilesToLevel(options.num_levels - 1); + + ASSERT_OK(Put("a", "val1")); + ASSERT_OK(Put("b", "val2")); + ASSERT_OK(Put("d", "val4")); + ASSERT_OK(Flush()); + + // Use TimedPut for key "c" with current write time + uint64_t write_time = env_->NowMicros() / 1000000; + ASSERT_OK(TimedPut("c", "val3", write_time /*write_unix_time*/)); + ASSERT_OK(Put("d", "val4")); + ASSERT_OK(Flush()); + + std::vector input_files; + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(&cf_meta); + + for (const auto& file : cf_meta.levels[0].files) { + input_files.push_back(file.name); + } + + ASSERT_EQ(input_files.size(), 2); + + ResetStatistics(); + + CompactionOptions compact_options; + ASSERT_OK( + db_->CompactFiles(compact_options, input_files, 1 /* output_level*/)); + + ASSERT_EQ(Get("a"), "val1"); + ASSERT_EQ(Get("b"), "val2"); + ASSERT_EQ(Get("c"), "val3"); + ASSERT_EQ(Get("d"), "val4"); + + VerifyResumeBytes(); +} } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/compaction/compaction_state.cc b/db/compaction/compaction_state.cc index bf016d04b694..febf2e01d1e0 100644 --- a/db/compaction/compaction_state.cc +++ b/db/compaction/compaction_state.cc @@ -36,11 +36,11 @@ Slice CompactionState::LargestUserKey() { } void CompactionState::AggregateCompactionStats( - InternalStats::CompactionStatsFull& compaction_stats, - CompactionJobStats& compaction_job_stats) { + InternalStats::CompactionStatsFull& internal_stats, + CompactionJobStats& job_stats) { for (const auto& sc : sub_compact_states) { - sc.AggregateCompactionOutputStats(compaction_stats); - compaction_job_stats.Add(sc.compaction_job_stats); + sc.AggregateCompactionOutputStats(internal_stats); + job_stats.Add(sc.compaction_job_stats); } } } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_state.h b/db/compaction/compaction_state.h index cc5b66c68224..faad712b6ff5 100644 --- a/db/compaction/compaction_state.h +++ b/db/compaction/compaction_state.h @@ -29,8 +29,8 @@ class CompactionState { Status status; void AggregateCompactionStats( - InternalStats::CompactionStatsFull& compaction_stats, - CompactionJobStats& compaction_job_stats); + InternalStats::CompactionStatsFull& internal_stats, + CompactionJobStats& job_stats); explicit CompactionState(Compaction* c) : compaction(c) {} diff --git a/db/compaction/subcompaction_state.cc b/db/compaction/subcompaction_state.cc index 13f40f63f0ca..0e8f673c1124 100644 --- a/db/compaction/subcompaction_state.cc +++ b/db/compaction/subcompaction_state.cc @@ -14,33 +14,32 @@ namespace ROCKSDB_NAMESPACE { void SubcompactionState::AggregateCompactionOutputStats( - InternalStats::CompactionStatsFull& compaction_stats) const { + InternalStats::CompactionStatsFull& internal_stats) const { // Outputs should be closed. By extension, any files created just for // range deletes have already been written also. assert(compaction_outputs_.HasBuilder() == false); - assert(penultimate_level_outputs_.HasBuilder() == false); + assert(proximal_level_outputs_.HasBuilder() == false); // FIXME: These stats currently include abandonned output files // assert(compaction_outputs_.stats_.num_output_files == // compaction_outputs_.outputs_.size()); - // assert(penultimate_level_outputs_.stats_.num_output_files == - // penultimate_level_outputs_.outputs_.size()); + // assert(proximal_level_outputs_.stats_.num_output_files == + // proximal_level_outputs_.outputs_.size()); - compaction_stats.stats.Add(compaction_outputs_.stats_); - if (penultimate_level_outputs_.HasOutput()) { - compaction_stats.has_penultimate_level_output = true; - compaction_stats.penultimate_level_stats.Add( - penultimate_level_outputs_.stats_); + internal_stats.output_level_stats.Add(compaction_outputs_.stats_); + if (proximal_level_outputs_.HasOutput()) { + internal_stats.has_proximal_level_output = true; + internal_stats.proximal_level_stats.Add(proximal_level_outputs_.stats_); } } OutputIterator SubcompactionState::GetOutputs() const { - return OutputIterator(penultimate_level_outputs_.outputs_, + return OutputIterator(proximal_level_outputs_.outputs_, compaction_outputs_.outputs_); } void SubcompactionState::Cleanup(Cache* cache) { - penultimate_level_outputs_.Cleanup(); + proximal_level_outputs_.Cleanup(); compaction_outputs_.Cleanup(); if (!status.ok()) { @@ -63,9 +62,9 @@ void SubcompactionState::Cleanup(Cache* cache) { } Slice SubcompactionState::SmallestUserKey() const { - if (penultimate_level_outputs_.HasOutput()) { + if (proximal_level_outputs_.HasOutput()) { Slice a = compaction_outputs_.SmallestUserKey(); - Slice b = penultimate_level_outputs_.SmallestUserKey(); + Slice b = proximal_level_outputs_.SmallestUserKey(); if (a.empty()) { return b; } @@ -85,9 +84,9 @@ Slice SubcompactionState::SmallestUserKey() const { } Slice SubcompactionState::LargestUserKey() const { - if (penultimate_level_outputs_.HasOutput()) { + if (proximal_level_outputs_.HasOutput()) { Slice a = compaction_outputs_.LargestUserKey(); - Slice b = penultimate_level_outputs_.LargestUserKey(); + Slice b = proximal_level_outputs_.LargestUserKey(); if (a.empty()) { return b; } @@ -107,13 +106,15 @@ Slice SubcompactionState::LargestUserKey() const { } Status SubcompactionState::AddToOutput( - const CompactionIterator& iter, bool use_penultimate_output, + const CompactionIterator& iter, bool use_proximal_output, const CompactionFileOpenFunc& open_file_func, - const CompactionFileCloseFunc& close_file_func) { + const CompactionFileCloseFunc& close_file_func, + const ParsedInternalKey& prev_iter_output_internal_key) { // update target output - current_outputs_ = use_penultimate_output ? &penultimate_level_outputs_ - : &compaction_outputs_; - return current_outputs_->AddToOutput(iter, open_file_func, close_file_func); + current_outputs_ = + use_proximal_output ? &proximal_level_outputs_ : &compaction_outputs_; + return current_outputs_->AddToOutput(iter, open_file_func, close_file_func, + prev_iter_output_internal_key); } } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/subcompaction_state.h b/db/compaction/subcompaction_state.h index 6a28f74d9089..38785f9ae085 100644 --- a/db/compaction/subcompaction_state.h +++ b/db/compaction/subcompaction_state.h @@ -26,13 +26,13 @@ namespace ROCKSDB_NAMESPACE { // Maintains state and outputs for each sub-compaction // It contains 2 `CompactionOutputs`: // 1. one for the normal output files -// 2. another for the penultimate level outputs +// 2. another for the proximal level outputs // a `current` pointer maintains the current output group, when calling // `AddToOutput()`, it checks the output of the current compaction_iterator key // and point `current` to the target output group. By default, it just points to // normal compaction_outputs, if the compaction_iterator key should be placed on -// the penultimate level, `current` is changed to point to -// `penultimate_level_outputs`. +// the proximal level, `current` is changed to point to +// `proximal_level_outputs`. // The later operations uses `Current()` to get the target group. // // +----------+ +-----------------------------+ +---------+ @@ -43,7 +43,7 @@ namespace ROCKSDB_NAMESPACE { // | | ... | // | // | +-----------------------------+ +---------+ -// +-------------> | penultimate_level_outputs |----->| output | +// +-------------> | proximal_level_outputs |----->| output | // +-----------------------------+ +---------+ // | ... | @@ -78,7 +78,7 @@ class SubcompactionState { Slice LargestUserKey() const; // Get all outputs from the subcompaction. For per_key_placement compaction, - // it returns both the last level outputs and penultimate level outputs. + // it returns both the last level outputs and proximal level outputs. OutputIterator GetOutputs() const; // Assign range dels aggregator. The various tombstones will potentially @@ -92,7 +92,15 @@ class SubcompactionState { void RemoveLastEmptyOutput() { compaction_outputs_.RemoveLastEmptyOutput(); - penultimate_level_outputs_.RemoveLastEmptyOutput(); + proximal_level_outputs_.RemoveLastEmptyOutput(); + } + + // Cleanup output builders for abandoning in-progress files. + void CleanupOutputs() { + compaction_outputs_.Cleanup(); + if (compaction->SupportsPerKeyPlacement()) { + proximal_level_outputs_.Cleanup(); + } } void BuildSubcompactionJobInfo( @@ -106,7 +114,11 @@ class SubcompactionState { subcompaction_job_info.subcompaction_job_id = static_cast(sub_job_id); subcompaction_job_info.base_input_level = c->start_level(); subcompaction_job_info.output_level = c->output_level(); + subcompaction_job_info.compaction_reason = c->compaction_reason(); + subcompaction_job_info.compression = c->output_compression(); subcompaction_job_info.stats = compaction_job_stats; + subcompaction_job_info.blob_compression_type = + c->mutable_cf_options().blob_compression_type; } SubcompactionState() = delete; @@ -119,14 +131,14 @@ class SubcompactionState { start(_start), end(_end), sub_job_id(_sub_job_id), - compaction_outputs_(c, /*is_penultimate_level=*/false), - penultimate_level_outputs_(c, /*is_penultimate_level=*/true) { + compaction_outputs_(c, /*is_proximal_level=*/false), + proximal_level_outputs_(c, /*is_proximal_level=*/true) { assert(compaction != nullptr); // Set output split key (used for RoundRobin feature) only for normal - // compaction_outputs, output to penultimate_level feature doesn't support + // compaction_outputs, output to proximal_level feature doesn't support // RoundRobin feature (and may never going to be supported, because for // RoundRobin, the data time is mostly naturally sorted, no need to have - // per-key placement with output_to_penultimate_level). + // per-key placement with output_to_proximal_level). compaction_outputs_.SetOutputSlitKey(start, end); } @@ -141,18 +153,17 @@ class SubcompactionState { compaction_job_stats(std::move(state.compaction_job_stats)), sub_job_id(state.sub_job_id), compaction_outputs_(std::move(state.compaction_outputs_)), - penultimate_level_outputs_(std::move(state.penultimate_level_outputs_)), + proximal_level_outputs_(std::move(state.proximal_level_outputs_)), range_del_agg_(std::move(state.range_del_agg_)) { - current_outputs_ = - state.current_outputs_ == &state.penultimate_level_outputs_ - ? &penultimate_level_outputs_ - : &compaction_outputs_; + current_outputs_ = state.current_outputs_ == &state.proximal_level_outputs_ + ? &proximal_level_outputs_ + : &compaction_outputs_; } // Add all the new files from this compaction to version_edit void AddOutputsEdit(VersionEdit* out_edit) const { - for (const auto& file : penultimate_level_outputs_.outputs_) { - out_edit->AddFile(compaction->GetPenultimateLevel(), file.meta); + for (const auto& file : proximal_level_outputs_.outputs_) { + out_edit->AddFile(compaction->GetProximalLevel(), file.meta); } for (const auto& file : compaction_outputs_.outputs_) { out_edit->AddFile(compaction->output_level(), file.meta); @@ -162,13 +173,40 @@ class SubcompactionState { void Cleanup(Cache* cache); void AggregateCompactionOutputStats( - InternalStats::CompactionStatsFull& compaction_stats) const; + InternalStats::CompactionStatsFull& internal_stats) const; CompactionOutputs& Current() const { assert(current_outputs_); return *current_outputs_; } + CompactionOutputs* Outputs(bool is_proximal_level) { + assert(compaction); + if (is_proximal_level) { + assert(compaction->SupportsPerKeyPlacement()); + return &proximal_level_outputs_; + } + return &compaction_outputs_; + } + + // Per-level stats for the output + InternalStats::CompactionStats* OutputStats(bool is_proximal_level) { + assert(compaction); + if (is_proximal_level) { + assert(compaction->SupportsPerKeyPlacement()); + return &proximal_level_outputs_.stats_; + } + return &compaction_outputs_.stats_; + } + + uint64_t GetWorkerCPUMicros() const { + uint64_t rv = compaction_outputs_.GetWorkerCPUMicros(); + if (compaction->SupportsPerKeyPlacement()) { + rv += proximal_level_outputs_.GetWorkerCPUMicros(); + } + return rv; + } + CompactionRangeDelAggregator* RangeDelAgg() const { return range_del_agg_.get(); } @@ -178,13 +216,22 @@ class SubcompactionState { return range_del_agg_ && !range_del_agg_->IsEmpty(); } + void SetSubcompactionProgress( + const SubcompactionProgress& subcompaction_progress) { + subcompaction_progress_ = subcompaction_progress; + } + + SubcompactionProgress& GetSubcompactionProgressRef() { + return subcompaction_progress_; + } + // Add compaction_iterator key/value to the `Current` output group. - Status AddToOutput(const CompactionIterator& iter, - bool use_penultimate_output, + Status AddToOutput(const CompactionIterator& iter, bool use_proximal_output, const CompactionFileOpenFunc& open_file_func, - const CompactionFileCloseFunc& close_file_func); + const CompactionFileCloseFunc& close_file_func, + const ParsedInternalKey& prev_iter_output_internal_key); - // Close all compaction output files, both output_to_penultimate_level outputs + // Close all compaction output files, both output_to_proximal_level outputs // and normal outputs. Status CloseCompactionFiles(const Status& curr_status, const CompactionFileOpenFunc& open_file_func, @@ -195,11 +242,11 @@ class SubcompactionState { // CloseOutput() may open new compaction output files. Status s = curr_status; if (per_key) { - s = penultimate_level_outputs_.CloseOutput( - s, range_del_agg_.get(), open_file_func, close_file_func); + s = proximal_level_outputs_.CloseOutput(s, range_del_agg_.get(), + open_file_func, close_file_func); } else { - assert(penultimate_level_outputs_.HasBuilder() == false); - assert(penultimate_level_outputs_.HasOutput() == false); + assert(proximal_level_outputs_.HasBuilder() == false); + assert(proximal_level_outputs_.HasOutput() == false); } s = compaction_outputs_.CloseOutput(s, range_del_agg_.get(), open_file_func, close_file_func); @@ -209,9 +256,11 @@ class SubcompactionState { private: // State kept for output being generated CompactionOutputs compaction_outputs_; - CompactionOutputs penultimate_level_outputs_; + CompactionOutputs proximal_level_outputs_; CompactionOutputs* current_outputs_ = &compaction_outputs_; std::unique_ptr range_del_agg_; + + SubcompactionProgress subcompaction_progress_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc index eed5cb936f06..7bd840e486d4 100644 --- a/db/compaction/tiered_compaction_test.cc +++ b/db/compaction/tiered_compaction_test.cc @@ -33,42 +33,13 @@ ConfigOptions GetStrictConfigOptions() { class TieredCompactionTest : public DBTestBase { public: TieredCompactionTest() - : DBTestBase("tiered_compaction_test", /*env_do_fsync=*/true), - kBasicCompStats(CompactionReason::kUniversalSizeAmplification, 1), - kBasicPerKeyPlacementCompStats( - CompactionReason::kUniversalSizeAmplification, 1), - kBasicFlushStats(CompactionReason::kFlush, 1) { - kBasicCompStats.micros = kHasValue; - kBasicCompStats.cpu_micros = kHasValue; - kBasicCompStats.bytes_read_non_output_levels = kHasValue; - kBasicCompStats.num_input_files_in_non_output_levels = kHasValue; - kBasicCompStats.num_input_records = kHasValue; - kBasicCompStats.num_dropped_records = kHasValue; - - kBasicPerLevelStats.num_output_records = kHasValue; - kBasicPerLevelStats.bytes_written = kHasValue; - kBasicPerLevelStats.num_output_files = kHasValue; - - kBasicPerKeyPlacementCompStats.micros = kHasValue; - kBasicPerKeyPlacementCompStats.cpu_micros = kHasValue; - kBasicPerKeyPlacementCompStats.Add(kBasicPerLevelStats); - - kBasicFlushStats.micros = kHasValue; - kBasicFlushStats.cpu_micros = kHasValue; - kBasicFlushStats.bytes_written = kHasValue; - kBasicFlushStats.num_output_files = kHasValue; - } + : DBTestBase("tiered_compaction_test", /*env_do_fsync=*/true) {} protected: - static constexpr uint8_t kHasValue = 1; - - InternalStats::CompactionStats kBasicCompStats; - InternalStats::CompactionStats kBasicPerKeyPlacementCompStats; - InternalStats::CompactionOutputsStats kBasicPerLevelStats; - InternalStats::CompactionStats kBasicFlushStats; - std::atomic_bool enable_per_key_placement = true; + CompactionJobStats job_stats; + void SetUp() override { SyncPoint::GetInstance()->SetCallBack( "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) { @@ -108,21 +79,36 @@ class TieredCompactionTest : public DBTestBase { // Verify the compaction stats, the stats are roughly compared void VerifyCompactionStats( - const std::vector& expect_stats, - const InternalStats::CompactionStats& expect_pl_stats) { + const std::vector& expected_stats, + const InternalStats::CompactionStats& expected_pl_stats, + size_t output_level, uint64_t num_input_range_del = 0) { const std::vector& stats = GetCompactionStats(); - const size_t kLevels = expect_stats.size(); + const size_t kLevels = expected_stats.size(); ASSERT_EQ(kLevels, stats.size()); + ASSERT_TRUE(output_level < kLevels); - for (auto it = stats.begin(), expect = expect_stats.begin(); - it != stats.end(); it++, expect++) { - VerifyCompactionStats(*it, *expect); + for (size_t level = 0; level < kLevels; level++) { + VerifyCompactionStats(stats[level], expected_stats[level]); } const InternalStats::CompactionStats& pl_stats = GetPerKeyPlacementCompactionStats(); - VerifyCompactionStats(pl_stats, expect_pl_stats); + VerifyCompactionStats(pl_stats, expected_pl_stats); + + const auto& output_level_stats = stats[output_level]; + CompactionJobStats expected_job_stats; + expected_job_stats.cpu_micros = output_level_stats.cpu_micros; + expected_job_stats.num_input_files = + output_level_stats.num_input_files_in_output_level + + output_level_stats.num_input_files_in_non_output_levels; + expected_job_stats.num_input_records = + output_level_stats.num_input_records - num_input_range_del; + expected_job_stats.num_output_files = + output_level_stats.num_output_files + pl_stats.num_output_files; + expected_job_stats.num_output_records = + output_level_stats.num_output_records + pl_stats.num_output_records; + VerifyCompactionJobStats(job_stats, expected_job_stats); } void ResetAllStats(std::vector& stats, @@ -139,42 +125,52 @@ class TieredCompactionTest : public DBTestBase { } private: - void CompareStats(uint64_t val, uint64_t expect) { - if (expect > 0) { - ASSERT_TRUE(val > 0); - } else { - ASSERT_EQ(val, 0); - } - } - void VerifyCompactionStats( const InternalStats::CompactionStats& stats, const InternalStats::CompactionStats& expect_stats) { - CompareStats(stats.micros, expect_stats.micros); - CompareStats(stats.cpu_micros, expect_stats.cpu_micros); - CompareStats(stats.bytes_read_non_output_levels, - expect_stats.bytes_read_non_output_levels); - CompareStats(stats.bytes_read_output_level, - expect_stats.bytes_read_output_level); - CompareStats(stats.bytes_read_blob, expect_stats.bytes_read_blob); - CompareStats(stats.bytes_written, expect_stats.bytes_written); - CompareStats(stats.bytes_moved, expect_stats.bytes_moved); - CompareStats(stats.num_input_files_in_non_output_levels, - expect_stats.num_input_files_in_non_output_levels); - CompareStats(stats.num_input_files_in_output_level, - expect_stats.num_input_files_in_output_level); - CompareStats(stats.num_output_files, expect_stats.num_output_files); - CompareStats(stats.num_output_files_blob, - expect_stats.num_output_files_blob); - CompareStats(stats.num_input_records, expect_stats.num_input_records); - CompareStats(stats.num_dropped_records, expect_stats.num_dropped_records); - CompareStats(stats.num_output_records, expect_stats.num_output_records); + ASSERT_EQ(stats.micros > 0, expect_stats.micros > 0); + ASSERT_EQ(stats.cpu_micros > 0, expect_stats.cpu_micros > 0); + + // Hard to get consistent byte sizes of SST files. + // Use ASSERT_NEAR for comparison + ASSERT_NEAR(stats.bytes_read_non_output_levels * 1.0f, + expect_stats.bytes_read_non_output_levels * 1.0f, + stats.bytes_read_non_output_levels * 0.5f); + ASSERT_NEAR(stats.bytes_read_output_level * 1.0f, + expect_stats.bytes_read_output_level * 1.0f, + stats.bytes_read_output_level * 0.5f); + ASSERT_NEAR(stats.bytes_read_blob * 1.0f, + expect_stats.bytes_read_blob * 1.0f, + stats.bytes_read_blob * 0.5f); + ASSERT_NEAR(stats.bytes_written * 1.0f, expect_stats.bytes_written * 1.0f, + stats.bytes_written * 0.5f); + + ASSERT_EQ(stats.bytes_moved, expect_stats.bytes_moved); + ASSERT_EQ(stats.num_input_files_in_non_output_levels, + expect_stats.num_input_files_in_non_output_levels); + ASSERT_EQ(stats.num_input_files_in_output_level, + expect_stats.num_input_files_in_output_level); + ASSERT_EQ(stats.num_output_files, expect_stats.num_output_files); + ASSERT_EQ(stats.num_output_files_blob, expect_stats.num_output_files_blob); + ASSERT_EQ(stats.num_input_records, expect_stats.num_input_records); + ASSERT_EQ(stats.num_dropped_records, expect_stats.num_dropped_records); + ASSERT_EQ(stats.num_output_records, expect_stats.num_output_records); + ASSERT_EQ(stats.count, expect_stats.count); for (int i = 0; i < static_cast(CompactionReason::kNumOfReasons); i++) { ASSERT_EQ(stats.counts[i], expect_stats.counts[i]); } } + + void VerifyCompactionJobStats(const CompactionJobStats& stats, + const CompactionJobStats& expected_stats) { + ASSERT_EQ(stats.cpu_micros, expected_stats.cpu_micros); + ASSERT_EQ(stats.num_input_files, expected_stats.num_input_files); + ASSERT_EQ(stats.num_input_records, expected_stats.num_input_records); + ASSERT_EQ(job_stats.num_output_files, expected_stats.num_output_files); + ASSERT_EQ(job_stats.num_output_records, expected_stats.num_output_records); + } }; TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) { @@ -199,19 +195,39 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) { [&](void* arg) { *static_cast(arg) = latest_cold_seq.load(); }); + SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Install:AfterUpdateCompactionJobStats", [&](void* arg) { + job_stats.Reset(); + job_stats.Add(*(static_cast(arg))); + }); SyncPoint::GetInstance()->EnableProcessing(); std::vector expect_stats(kNumLevels); - InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel]; InternalStats::CompactionStats expect_pl_stats; + // Put keys in the following way to create overlaps + // First file from 0 ~ 99 + // Second file from 10 ~ 109 + // ... + size_t bytes_per_file = 1952; + uint64_t total_input_key_count = kNumTrigger * kNumKeys; + uint64_t total_output_key_count = 130; // 0 ~ 129 + for (int i = 0; i < kNumTrigger; i++) { for (int j = 0; j < kNumKeys; j++) { ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i))); } ASSERT_OK(Flush()); + seq_history.emplace_back(dbfull()->GetLatestSequenceNumber()); - expect_stats[0].Add(kBasicFlushStats); + InternalStats::CompactionStats flush_stats(CompactionReason::kFlush, 1); + flush_stats.cpu_micros = 1; + flush_stats.micros = 1; + flush_stats.bytes_written = bytes_per_file; + flush_stats.num_output_files = 1; + flush_stats.num_input_records = kNumKeys; + flush_stats.num_output_records = kNumKeys; + expect_stats[0].Add(flush_stats); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); @@ -221,32 +237,97 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) { ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); - // basic compaction stats are still counted to the last level - expect_stats[kLastLevel].Add(kBasicCompStats); - expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); + uint64_t bytes_written_penultimate_level = + GetPerKeyPlacementCompactionStats().bytes_written; - VerifyCompactionStats(expect_stats, expect_pl_stats); + // TODO - Use designated initializer when c++20 support is required + { + InternalStats::CompactionStats last_level_compaction_stats( + CompactionReason::kUniversalSizeAmplification, 1); + last_level_compaction_stats.cpu_micros = 1; + last_level_compaction_stats.micros = 1; + last_level_compaction_stats.bytes_written = 0; + last_level_compaction_stats.bytes_read_non_output_levels = + bytes_per_file * kNumTrigger; + last_level_compaction_stats.num_input_files_in_non_output_levels = + kNumTrigger; + last_level_compaction_stats.num_input_records = total_input_key_count; + last_level_compaction_stats.num_dropped_records = + total_input_key_count - total_output_key_count; + last_level_compaction_stats.num_output_records = 0; + last_level_compaction_stats.num_output_files = 0; + expect_stats[kLastLevel].Add(last_level_compaction_stats); + } + { + InternalStats::CompactionStats penultimate_level_compaction_stats( + CompactionReason::kUniversalSizeAmplification, 1); + penultimate_level_compaction_stats.cpu_micros = 1; + penultimate_level_compaction_stats.micros = 1; + penultimate_level_compaction_stats.bytes_written = + bytes_written_penultimate_level; + penultimate_level_compaction_stats.num_output_files = 1; + penultimate_level_compaction_stats.num_output_records = + total_output_key_count; + expect_pl_stats.Add(penultimate_level_compaction_stats); + } + VerifyCompactionStats(expect_stats, expect_pl_stats, kLastLevel); ResetAllStats(expect_stats, expect_pl_stats); // move forward the cold_seq to split the file into 2 levels, so should have - // both the last level stats and the output_to_penultimate_level stats + // both the last level stats and the penultimate level stats latest_cold_seq = seq_history[0]; ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); - last_stats.Add(kBasicCompStats); - last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); - last_stats.Add(kBasicPerLevelStats); - last_stats.num_dropped_records = 0; - expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); - expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction); - VerifyCompactionStats(expect_stats, expect_pl_stats); + // Now update the input count to be the total count from the previous + total_input_key_count = total_output_key_count; + uint64_t moved_to_last_level_key_count = 10; - // delete all cold data, so all data will be on penultimate level + // bytes read in non output = bytes written in penultimate level from previous + uint64_t bytes_read_in_non_output_level = bytes_written_penultimate_level; + uint64_t bytes_written_output_level = + GetCompactionStats()[kLastLevel].bytes_written; + + // Now get the new bytes written in penultimate level + bytes_written_penultimate_level = + GetPerKeyPlacementCompactionStats().bytes_written; + { + InternalStats::CompactionStats last_level_compaction_stats( + CompactionReason::kManualCompaction, 1); + last_level_compaction_stats.cpu_micros = 1; + last_level_compaction_stats.micros = 1; + last_level_compaction_stats.bytes_written = bytes_written_output_level; + last_level_compaction_stats.bytes_read_non_output_levels = + bytes_read_in_non_output_level; + last_level_compaction_stats.num_input_files_in_non_output_levels = 1; + last_level_compaction_stats.num_input_records = total_input_key_count; + last_level_compaction_stats.num_dropped_records = + total_input_key_count - total_output_key_count; + last_level_compaction_stats.num_output_records = + moved_to_last_level_key_count; + last_level_compaction_stats.num_output_files = 1; + expect_stats[kLastLevel].Add(last_level_compaction_stats); + } + { + InternalStats::CompactionStats penultimate_level_compaction_stats( + CompactionReason::kManualCompaction, 1); + penultimate_level_compaction_stats.cpu_micros = 1; + penultimate_level_compaction_stats.micros = 1; + penultimate_level_compaction_stats.bytes_written = + bytes_written_penultimate_level; + penultimate_level_compaction_stats.num_output_files = 1; + penultimate_level_compaction_stats.num_output_records = + total_output_key_count - moved_to_last_level_key_count; + expect_pl_stats.Add(penultimate_level_compaction_stats); + } + VerifyCompactionStats(expect_stats, expect_pl_stats, kLastLevel); + + // delete all cold data, so all data will be on proximal level for (int i = 0; i < 10; i++) { ASSERT_OK(Delete(Key(i))); } @@ -255,17 +336,54 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) { ResetAllStats(expect_stats, expect_pl_stats); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); - last_stats.Add(kBasicCompStats); - last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); - last_stats.bytes_read_output_level = kHasValue; - last_stats.num_input_files_in_output_level = kHasValue; - expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); - expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction); - VerifyCompactionStats(expect_stats, expect_pl_stats); + // 10 tombstones added + total_input_key_count = total_input_key_count + 10; + total_output_key_count = total_output_key_count - 10; + + auto last_level_stats = GetCompactionStats()[kLastLevel]; + bytes_written_penultimate_level = + GetPerKeyPlacementCompactionStats().bytes_written; + + ASSERT_LT(bytes_written_penultimate_level, + last_level_stats.bytes_read_non_output_levels + + last_level_stats.bytes_read_output_level); + { + InternalStats::CompactionStats last_level_compaction_stats( + CompactionReason::kManualCompaction, 1); + last_level_compaction_stats.cpu_micros = 1; + last_level_compaction_stats.micros = 1; + last_level_compaction_stats.bytes_written = 0; + last_level_compaction_stats.bytes_read_non_output_levels = + last_level_stats.bytes_read_non_output_levels; + last_level_compaction_stats.bytes_read_output_level = + last_level_stats.bytes_read_output_level; + last_level_compaction_stats.num_input_files_in_non_output_levels = 2; + last_level_compaction_stats.num_input_files_in_output_level = 1; + last_level_compaction_stats.num_input_records = total_input_key_count; + last_level_compaction_stats.num_dropped_records = + total_input_key_count - total_output_key_count; + last_level_compaction_stats.num_output_records = 0; + last_level_compaction_stats.num_output_files = 0; + expect_stats[kLastLevel].Add(last_level_compaction_stats); + } + { + InternalStats::CompactionStats penultimate_level_compaction_stats( + CompactionReason::kManualCompaction, 1); + penultimate_level_compaction_stats.cpu_micros = 1; + penultimate_level_compaction_stats.micros = 1; + penultimate_level_compaction_stats.bytes_written = + bytes_written_penultimate_level; + penultimate_level_compaction_stats.num_output_files = 1; + penultimate_level_compaction_stats.num_output_records = + total_output_key_count; + expect_pl_stats.Add(penultimate_level_compaction_stats); + } + VerifyCompactionStats(expect_stats, expect_pl_stats, kLastLevel); // move forward the cold_seq again with range delete, take a snapshot to keep // the range dels in both cold and hot SSTs @@ -275,6 +393,7 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) { ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); ASSERT_OK(Flush()); + uint64_t num_input_range_del = 1; ResetAllStats(expect_stats, expect_pl_stats); @@ -283,12 +402,49 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) { ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); - last_stats.Add(kBasicCompStats); - last_stats.Add(kBasicPerLevelStats); - last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); - expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); - expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction); - VerifyCompactionStats(expect_stats, expect_pl_stats); + // Previous output + one delete range + total_input_key_count = total_output_key_count + num_input_range_del; + moved_to_last_level_key_count = 20; + + last_level_stats = GetCompactionStats()[kLastLevel]; + bytes_written_penultimate_level = + GetPerKeyPlacementCompactionStats().bytes_written; + // Expected to write more in last level + ASSERT_GT(bytes_written_penultimate_level, last_level_stats.bytes_written); + { + InternalStats::CompactionStats last_level_compaction_stats( + CompactionReason::kManualCompaction, 1); + last_level_compaction_stats.cpu_micros = 1; + last_level_compaction_stats.micros = 1; + last_level_compaction_stats.bytes_written = last_level_stats.bytes_written; + last_level_compaction_stats.bytes_read_non_output_levels = + last_level_stats.bytes_read_non_output_levels; + last_level_compaction_stats.bytes_read_output_level = 0; + last_level_compaction_stats.num_input_files_in_non_output_levels = 2; + last_level_compaction_stats.num_input_files_in_output_level = 0; + last_level_compaction_stats.num_input_records = total_input_key_count; + last_level_compaction_stats.num_dropped_records = + num_input_range_del; // delete range tombstone + last_level_compaction_stats.num_output_records = + moved_to_last_level_key_count; + last_level_compaction_stats.num_output_files = 1; + expect_stats[kLastLevel].Add(last_level_compaction_stats); + } + { + InternalStats::CompactionStats penultimate_level_compaction_stats( + CompactionReason::kManualCompaction, 1); + penultimate_level_compaction_stats.cpu_micros = 1; + penultimate_level_compaction_stats.micros = 1; + penultimate_level_compaction_stats.bytes_written = + bytes_written_penultimate_level; + penultimate_level_compaction_stats.num_output_files = 1; + penultimate_level_compaction_stats.num_output_records = + total_input_key_count - moved_to_last_level_key_count - + num_input_range_del; + expect_pl_stats.Add(penultimate_level_compaction_stats); + } + VerifyCompactionStats(expect_stats, expect_pl_stats, kLastLevel, + num_input_range_del); // verify data std::string value; @@ -341,11 +497,11 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) { // This test was essentially for a hacked-up version on future functionality. // It can be resurrected if/when a form of range-based tiering is properly // implemented. +// TODO - Add stats verification when adding this test back TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) { const int kNumTrigger = 4; const int kNumLevels = 7; const int kNumKeys = 100; - const int kLastLevel = kNumLevels - 1; auto options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; @@ -364,14 +520,13 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) { "CompactionIterator::PrepareOutput.context", [&](void* arg) { auto context = static_cast(arg); MutexLock l(&mutex); - context->output_to_penultimate_level = + context->output_to_proximal_level = cmp->Compare(context->key, hot_start) >= 0 && cmp->Compare(context->key, hot_end) < 0; }); SyncPoint::GetInstance()->EnableProcessing(); std::vector expect_stats(kNumLevels); - InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel]; InternalStats::CompactionStats expect_pl_stats; for (int i = 0; i < kNumTrigger; i++) { @@ -379,21 +534,15 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) { ASSERT_OK(Put(Key(j), "value" + std::to_string(j))); } ASSERT_OK(Flush()); - expect_stats[0].Add(kBasicFlushStats); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); - last_stats.Add(kBasicCompStats); - last_stats.Add(kBasicPerLevelStats); - expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); - VerifyCompactionStats(expect_stats, expect_pl_stats); - ResetAllStats(expect_stats, expect_pl_stats); - // change to all cold, no output_to_penultimate_level output + // change to all cold, no output_to_proximal_level output { MutexLock l(&mutex); hot_start = Key(100); @@ -404,14 +553,6 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) { ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); - last_stats.Add(kBasicCompStats); - last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); - last_stats.Add(kBasicPerLevelStats); - last_stats.num_dropped_records = 0; - last_stats.bytes_read_output_level = kHasValue; - last_stats.num_input_files_in_output_level = kHasValue; - VerifyCompactionStats(expect_stats, expect_pl_stats); - // change to all hot, universal compaction support moving data to up level if // it's within compaction level range. { @@ -421,7 +562,7 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) { } // No data is moved from cold tier to hot tier because no input files from L5 - // or higher, it's not safe to move data to output_to_penultimate_level level. + // or higher, it's not safe to move data to output_to_proximal_level level. ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); @@ -567,7 +708,7 @@ TEST_F(TieredCompactionTest, LevelColdRangeDelete) { // 20->30 will be marked as cold data, but it cannot be placed to cold tier // (bottommost) otherwise, it will be "deleted" by the range del in - // output_to_penultimate_level level verify that these data will be able to + // output_to_proximal_level level verify that these data will be able to // queried for (int i = 20; i < 30; i++) { ASSERT_OK(Put(Key(i), "value" + std::to_string(i))); @@ -677,17 +818,17 @@ TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) { std::vector> level_to_files; dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), &level_to_files); - // range tombstone is in the penultimate level - const int penultimate_level = kNumLevels - 2; - ASSERT_EQ(level_to_files[penultimate_level].size(), 1); - ASSERT_EQ(level_to_files[penultimate_level][0].num_entries, 1); - ASSERT_EQ(level_to_files[penultimate_level][0].num_deletions, 1); - ASSERT_EQ(level_to_files[penultimate_level][0].temperature, + // range tombstone is in the proximal level + const int proximal_level = kNumLevels - 2; + ASSERT_EQ(level_to_files[proximal_level].size(), 1); + ASSERT_EQ(level_to_files[proximal_level][0].num_entries, 1); + ASSERT_EQ(level_to_files[proximal_level][0].num_deletions, 1); + ASSERT_EQ(level_to_files[proximal_level][0].temperature, Temperature::kUnknown); ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); ASSERT_EQ("0,1,10", - FilesPerLevel()); // one file is at the penultimate level which + FilesPerLevel()); // one file is at the proximal level which // only contains a range delete // Add 2 hot keys, each is a new SST, they will be placed in the same level as @@ -701,7 +842,7 @@ TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) { ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); ASSERT_EQ("0,2,10", - FilesPerLevel()); // one file is at the penultimate level + FilesPerLevel()); // one file is at the proximal level // which only contains a range delete std::vector live_file_meta; db_->GetLiveFilesMetaData(&live_file_meta); @@ -711,7 +852,7 @@ TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) { if (meta.num_deletions > 0) { // found SST with del, which has 2 entries, one for data one for range del ASSERT_EQ(meta.level, - kNumLevels - 2); // output to penultimate level + kNumLevels - 2); // output to proximal level ASSERT_EQ(meta.num_entries, 2); ASSERT_EQ(meta.num_deletions, 1); found_sst_with_del = true; @@ -722,7 +863,7 @@ TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) { // release the first snapshot and compact, which should compact the range del // but new inserted key `0` and `6` are still hot data which will be placed on - // the penultimate level + // the proximal level db_->ReleaseSnapshot(snap); ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); ASSERT_EQ("0,2,7", FilesPerLevel()); @@ -738,7 +879,7 @@ TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) { ASSERT_FALSE(found_sst_with_del); // Now make all data cold, key 0 will be moved to the last level, but key 6 is - // still in snap2, so it will be kept at the penultimate level + // still in snap2, so it will be kept at the proximal level latest_cold_seq = dbfull()->GetLatestSequenceNumber(); ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); ASSERT_EQ("0,1,8", FilesPerLevel()); @@ -783,7 +924,7 @@ TEST_F(TieredCompactionTest, UniversalRangeDelete) { } ASSERT_OK(Flush()); - // compact to the penultimate level with 10 files + // compact to the proximal level with 10 files CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); @@ -810,7 +951,7 @@ TEST_F(TieredCompactionTest, UniversalRangeDelete) { ASSERT_EQ("0,0,0,0,0,0,8", FilesPerLevel()); - // range del with snapshot should be preserved in the penultimate level + // range del with snapshot should be preserved in the proximal level auto snap = db_->GetSnapshot(); start = Key(6); @@ -841,7 +982,7 @@ TEST_F(TieredCompactionTest, UniversalRangeDelete) { if (meta.num_deletions > 0) { // found SST with del, which has 2 entries, one for data one for range del ASSERT_EQ(meta.level, - kNumLevels - 2); // output_to_penultimate_level level + kNumLevels - 2); // output_to_proximal_level level ASSERT_EQ(meta.num_entries, 2); ASSERT_EQ(meta.num_deletions, 1); found_sst_with_del = true; @@ -890,6 +1031,8 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) { const int kNumKeys = 100; const int kLastLevel = kNumLevels - 1; + int output_level = 0; + auto options = CurrentOptions(); SetColdTemperature(options); options.level0_file_num_compaction_trigger = kNumTrigger; @@ -906,18 +1049,42 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) { [&](void* arg) { *static_cast(arg) = latest_cold_seq.load(); }); + SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Install:AfterUpdateCompactionJobStats", [&](void* arg) { + job_stats.Reset(); + job_stats.Add(*(static_cast(arg))); + }); + SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) { + auto compaction = static_cast(arg); + output_level = compaction->output_level(); + }); SyncPoint::GetInstance()->EnableProcessing(); std::vector expect_stats(kNumLevels); - InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel]; InternalStats::CompactionStats expect_pl_stats; + // Put keys in the following way to create overlaps + // First file from 0 ~ 99 + // Second file from 10 ~ 109 + // ... + size_t bytes_per_file = 1952; + uint64_t total_input_key_count = kNumTrigger * kNumKeys; + uint64_t total_output_key_count = 130; // 0 ~ 129 + for (int i = 0; i < kNumTrigger; i++) { for (int j = 0; j < kNumKeys; j++) { ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i))); } ASSERT_OK(Flush()); - expect_stats[0].Add(kBasicFlushStats); + InternalStats::CompactionStats flush_stats(CompactionReason::kFlush, 1); + flush_stats.cpu_micros = 1; + flush_stats.micros = 1; + flush_stats.bytes_written = bytes_per_file; + flush_stats.num_output_files = 1; + flush_stats.num_input_records = kNumKeys; + flush_stats.num_output_records = kNumKeys; + expect_stats[0].Add(flush_stats); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); @@ -926,10 +1093,30 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) { ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); - expect_stats[1].Add(kBasicCompStats); - expect_stats[1].Add(kBasicPerLevelStats); - expect_stats[1].ResetCompactionReason(CompactionReason::kLevelL0FilesNum); - VerifyCompactionStats(expect_stats, expect_pl_stats); + uint64_t bytes_written_output_level = + GetCompactionStats()[output_level].bytes_written; + ASSERT_GT(bytes_written_output_level, 0); + + { + InternalStats::CompactionStats output_level_compaction_stats( + CompactionReason::kLevelL0FilesNum, 1); + output_level_compaction_stats.cpu_micros = 1; + output_level_compaction_stats.micros = 1; + output_level_compaction_stats.bytes_written = bytes_written_output_level; + output_level_compaction_stats.bytes_read_non_output_levels = + bytes_per_file * kNumTrigger; + output_level_compaction_stats.bytes_read_output_level = 0; + output_level_compaction_stats.num_input_files_in_non_output_levels = + kNumTrigger; + output_level_compaction_stats.num_input_files_in_output_level = 0; + output_level_compaction_stats.num_input_records = total_input_key_count; + output_level_compaction_stats.num_dropped_records = + total_input_key_count - total_output_key_count; + output_level_compaction_stats.num_output_records = total_output_key_count; + output_level_compaction_stats.num_output_files = 1; + expect_stats[output_level].Add(output_level_compaction_stats); + } + VerifyCompactionStats(expect_stats, expect_pl_stats, output_level); // move all data to the last level MoveFilesToLevel(kLastLevel); @@ -944,15 +1131,26 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) { ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); - last_stats.Add(kBasicCompStats); - last_stats.Add(kBasicPerLevelStats); - last_stats.num_dropped_records = 0; - last_stats.bytes_read_non_output_levels = 0; - last_stats.num_input_files_in_non_output_levels = 0; - last_stats.bytes_read_output_level = kHasValue; - last_stats.num_input_files_in_output_level = kHasValue; - last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); - VerifyCompactionStats(expect_stats, expect_pl_stats); + total_input_key_count = total_output_key_count; + { + InternalStats::CompactionStats output_level_compaction_stats( + CompactionReason::kManualCompaction, 1); + output_level_compaction_stats.cpu_micros = 1; + output_level_compaction_stats.micros = 1; + output_level_compaction_stats.bytes_written = bytes_written_output_level; + output_level_compaction_stats.bytes_read_non_output_levels = 0; + output_level_compaction_stats.bytes_read_output_level = + bytes_written_output_level; + output_level_compaction_stats.num_input_files_in_non_output_levels = 0; + output_level_compaction_stats.num_input_files_in_output_level = 1; + output_level_compaction_stats.num_input_records = total_input_key_count; + output_level_compaction_stats.num_dropped_records = + total_input_key_count - total_output_key_count; + output_level_compaction_stats.num_output_records = total_output_key_count; + output_level_compaction_stats.num_output_files = 1; + expect_stats[output_level].Add(output_level_compaction_stats); + } + VerifyCompactionStats(expect_stats, expect_pl_stats, output_level); // Add new data, which is all hot and overriding all existing data latest_cold_seq = dbfull()->GetLatestSequenceNumber(); @@ -976,17 +1174,47 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) { ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); + uint64_t bytes_written_in_proximal_level = + GetPerKeyPlacementCompactionStats().bytes_written; for (int level = 2; level < kNumLevels - 1; level++) { - expect_stats[level].bytes_moved = kHasValue; + expect_stats[level].bytes_moved = bytes_written_in_proximal_level; } - last_stats.Add(kBasicCompStats); - last_stats.bytes_read_output_level = kHasValue; - last_stats.num_input_files_in_output_level = kHasValue; - last_stats.ResetCompactionReason(CompactionReason::kManualCompaction); - expect_pl_stats.Add(kBasicPerKeyPlacementCompStats); - expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction); - VerifyCompactionStats(expect_stats, expect_pl_stats); + // Another set of 130 keys + from the previous + total_input_key_count = total_output_key_count + 130; + // Merged into 130 + total_output_key_count = 130; + + { + InternalStats::CompactionStats output_level_compaction_stats( + CompactionReason::kManualCompaction, 1); + output_level_compaction_stats.cpu_micros = 1; + output_level_compaction_stats.micros = 1; + output_level_compaction_stats.bytes_written = 0; + output_level_compaction_stats.bytes_read_non_output_levels = + bytes_written_in_proximal_level; + output_level_compaction_stats.bytes_read_output_level = + bytes_written_output_level; + output_level_compaction_stats.num_input_files_in_non_output_levels = 1; + output_level_compaction_stats.num_input_files_in_output_level = 1; + output_level_compaction_stats.num_input_records = total_input_key_count; + output_level_compaction_stats.num_dropped_records = + total_input_key_count - total_output_key_count; + output_level_compaction_stats.num_output_records = 0; + output_level_compaction_stats.num_output_files = 0; + expect_stats[output_level].Add(output_level_compaction_stats); + } + { + InternalStats::CompactionStats proximal_level_compaction_stats( + CompactionReason::kManualCompaction, 1); + expect_pl_stats.cpu_micros = 1; + expect_pl_stats.micros = 1; + expect_pl_stats.bytes_written = bytes_written_in_proximal_level; + expect_pl_stats.num_output_files = 1; + expect_pl_stats.num_output_records = total_output_key_count; + expect_pl_stats.Add(proximal_level_compaction_stats); + } + VerifyCompactionStats(expect_stats, expect_pl_stats, output_level); // move forward the cold_seq, try to split the data into cold and hot, but in // this case it's unsafe to split the data @@ -1138,7 +1366,7 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageLevel) { "CompactionIterator::PrepareOutput.context", [&](void* arg) { auto context = static_cast(arg); MutexLock l(&mutex); - context->output_to_penultimate_level = + context->output_to_proximal_level = cmp->Compare(context->key, hot_start) >= 0 && cmp->Compare(context->key, hot_end) < 0; }); @@ -1221,10 +1449,10 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageLevel) { options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), 1); - // Tests that we only compact keys up to penultimate level - // that are within penultimate level input's internal key range. - // UPDATE: this functionality has changed. With penultimate-enabled - // compaction, the expanded potential output range in the penultimate + // Tests that we only compact keys up to proximal level + // that are within proximal level input's internal key range. + // UPDATE: this functionality has changed. With proximal-enabled + // compaction, the expanded potential output range in the proximal // level is reserved so should be safe to use. { MutexLock l(&mutex); @@ -1376,7 +1604,7 @@ TEST_P(PrecludeLastLevelTest, MigrationFromPreserveTimeManualCompaction) { cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - // all data is moved up to the penultimate level + // all data is moved up to the proximal level ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); @@ -1448,7 +1676,7 @@ TEST_P(PrecludeLastLevelTest, MigrationFromPreserveTimeAutoCompaction) { ASSERT_OK(dbfull()->TEST_WaitForCompact()); } - // all data is moved up to the penultimate level + // all data is moved up to the proximal level ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); @@ -1489,9 +1717,8 @@ TEST_P(PrecludeLastLevelTest, MigrationFromPreserveTimePartial) { ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); std::vector key_versions; - ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(), - std::numeric_limits::max(), - &key_versions)); + ASSERT_OK(GetAllKeyVersions( + db_.get(), {}, {}, std::numeric_limits::max(), &key_versions)); // make sure there're more than 300 keys and first 100 keys are having seqno // zeroed out, the last 100 key seqno not zeroed out @@ -1537,7 +1764,10 @@ TEST_P(PrecludeLastLevelTest, SmallPrecludeTime) { options.env = mock_env_.get(); options.level0_file_num_compaction_trigger = kNumTrigger; options.num_levels = kNumLevels; - options.last_level_temperature = Temperature::kCold; + // This existing test selected to also check the case of various temperatures + // for last_level_temperature, which should not be interesting enough to + // exercise across many/all test cases + options.last_level_temperature = RandomKnownTemperature(); DestroyAndReopen(options); Random rnd(301); @@ -1564,6 +1794,10 @@ TEST_P(PrecludeLastLevelTest, SmallPrecludeTime) { ASSERT_FALSE(tp_mapping.Empty()); auto seqs = tp_mapping.TEST_GetInternalMapping(); ASSERT_FALSE(seqs.empty()); + ASSERT_GE(GetSstSizeHelper(Temperature::kUnknown), 1); + for (auto t : kKnownTemperatures) { + ASSERT_EQ(GetSstSizeHelper(t), 0); + } // Wait more than preclude_last_level time, then make sure all the data is // compacted to the last level even there's no write (no seqno -> time @@ -1572,16 +1806,22 @@ TEST_P(PrecludeLastLevelTest, SmallPrecludeTime) { ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); - ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); - ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); + + for (auto t : kKnownTemperatures) { + if (t == options.last_level_temperature) { + ASSERT_GT(GetSstSizeHelper(t), 0); + } else { + ASSERT_EQ(GetSstSizeHelper(t), 0); + } + } Close(); } TEST_P(PrecludeLastLevelTest, CheckInternalKeyRange) { - // When compacting keys from the last level to penultimate level, - // output to penultimate level should be within internal key range - // of input files from penultimate level. + // When compacting keys from the last level to proximal level, + // output to proximal level should be within internal key range + // of input files from proximal level. // Set up: // L5: // File 1: DeleteRange[1, 3)@4, File 2: [3@5, 100@6] @@ -1719,8 +1959,8 @@ TEST_P(PrecludeWithCompactStyleTest, RangeTombstoneSnapshotMigrateFromLast) { ApplyConfigChange(&options, {{"preclude_last_level_data_seconds", "10000"}}); - // To exercise the WithinPenultimateLevelOutputRange feature, we want files - // around the middle file to be compacted on the penultimate level + // To exercise the WithinProximalLevelOutputRange feature, we want files + // around the middle file to be compacted on the proximal level ASSERT_OK(Put(Key(0), "val0")); ASSERT_OK(Flush()); ASSERT_OK(Put(Key(3), "val3")); @@ -1777,9 +2017,9 @@ TEST_P(PrecludeWithCompactStyleTest, RangeTombstoneSnapshotMigrateFromLast) { EXPECT_EQ("0,0,0,0,0,3,1", FilesPerLevel()); VerifyLogicalState(__LINE__); - // Compact everything, but some data still goes to both penultimate and last + // Compact everything, but some data still goes to both proximal and last // levels. A full-range compaction should be safe to "migrate" data from the - // last level to penultimate (because of preclude setting change). + // last level to proximal (because of preclude setting change). ASSERT_OK(CompactRange({}, {}, {})); EXPECT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); VerifyLogicalState(__LINE__); @@ -1898,7 +2138,7 @@ TEST_P(TimedPutPrecludeLastLevelTest, InterleavedTimedPutAndPut) { Close(); } -TEST_P(TimedPutPrecludeLastLevelTest, PreserveTimedPutOnPenultimateLevel) { +TEST_P(TimedPutPrecludeLastLevelTest, PreserveTimedPutOnProximalLevel) { Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.disable_auto_compactions = true; @@ -1924,14 +2164,14 @@ TEST_P(TimedPutPrecludeLastLevelTest, PreserveTimedPutOnPenultimateLevel) { ASSERT_OK(TimedPut(0, Key(2), "v2", kMockStartTime - 1 * 24 * 60 * 60, wo)); ASSERT_OK(Flush()); - // Should still be in penultimate level. + // Should still be in proximal level. ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); ASSERT_GT(GetSstSizeHelper(Temperature::kHot), 0); ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); // Wait one more day and release snapshot. Data's preferred seqno should be - // swapped in, but data should still stay in penultimate level. SST file's + // swapped in, but data should still stay in proximal level. SST file's // seqno to time mapping should continue to cover preferred seqno after // compaction. db_->ReleaseSnapshot(snap1); @@ -2079,9 +2319,8 @@ TEST_P(PrecludeLastLevelTest, LastLevelOnlyCompactionPartial) { ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); std::vector key_versions; - ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(), - std::numeric_limits::max(), - &key_versions)); + ASSERT_OK(GetAllKeyVersions( + db_.get(), {}, {}, std::numeric_limits::max(), &key_versions)); // make sure there're more than 300 keys and first 100 keys are having seqno // zeroed out, the last 100 key seqno not zeroed out @@ -2253,13 +2492,13 @@ TEST_P(PrecludeLastLevelOptionalTest, LastLevelOnlyCompactionNoPreclude) { Close(); } -TEST_P(PrecludeLastLevelOptionalTest, PeriodicCompactionToPenultimateLevel) { +TEST_P(PrecludeLastLevelOptionalTest, PeriodicCompactionToProximalLevel) { // Test the last level only periodic compaction should also be blocked by an - // ongoing compaction in penultimate level if tiered compaction is enabled + // ongoing compaction in proximal level if tiered compaction is enabled // otherwise, the periodic compaction should just run for the last level. const int kNumTrigger = 4; const int kNumLevels = 7; - const int kPenultimateLevel = kNumLevels - 2; + const int kProximalLevel = kNumLevels - 2; const int kKeyPerSec = 1; const int kNumKeys = 100; @@ -2301,13 +2540,13 @@ TEST_P(PrecludeLastLevelOptionalTest, PeriodicCompactionToPenultimateLevel) { SyncPoint::GetInstance()->SetCallBack( "CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) { auto compaction = static_cast(arg); - if (compaction->output_level() == kPenultimateLevel) { + if (compaction->output_level() == kProximalLevel) { is_size_ratio_compaction_running = true; TEST_SYNC_POINT( - "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:" + "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:" "SizeRatioCompaction1"); TEST_SYNC_POINT( - "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:" + "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:" "SizeRatioCompaction2"); is_size_ratio_compaction_running = false; } @@ -2329,17 +2568,17 @@ TEST_P(PrecludeLastLevelOptionalTest, PeriodicCompactionToPenultimateLevel) { verified_last_level_compaction = true; } TEST_SYNC_POINT( - "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:" + "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:" "AutoCompactionPicked"); }); SyncPoint::GetInstance()->LoadDependency({ - {"PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:" + {"PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:" "SizeRatioCompaction1", - "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:DoneWrite"}, - {"PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:" + "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:DoneWrite"}, + {"PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:" "AutoCompactionPicked", - "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:" + "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:" "SizeRatioCompaction2"}, }); @@ -2356,11 +2595,11 @@ TEST_P(PrecludeLastLevelOptionalTest, PeriodicCompactionToPenultimateLevel) { } TEST_SYNC_POINT( - "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:DoneWrite"); + "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:DoneWrite"); // wait for periodic compaction time and flush to trigger the periodic // compaction, which should be blocked by ongoing compaction in the - // penultimate level + // proximal level mock_clock_->MockSleepForSeconds(10000); for (int i = 0; i < 3 * kNumKeys; i++) { ASSERT_OK(Put(Key(i), rnd.RandomString(10))); @@ -2423,7 +2662,7 @@ class ThreeRangesPartitionerFactory : public SstPartitionerFactory { } }; -TEST_P(PrecludeLastLevelTest, PartialPenultimateLevelCompaction) { +TEST_P(PrecludeLastLevelTest, PartialProximalLevelCompaction) { const int kNumTrigger = 4; const int kNumLevels = 7; const int kKeyPerSec = 10; @@ -2593,8 +2832,8 @@ TEST_P(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) { "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { auto compaction = static_cast(arg); if (compaction->SupportsPerKeyPlacement()) { - ASSERT_EQ(compaction->GetPenultimateOutputRangeType(), - Compaction::PenultimateOutputRangeType::kNonLastRange); + ASSERT_EQ(compaction->GetProximalOutputRangeType(), + Compaction::ProximalOutputRangeType::kNonLastRange); per_key_comp_num++; } }); @@ -2650,7 +2889,7 @@ TEST_P(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) { ASSERT_EQ(3, per_key_comp_num); verify_db(); - // Finish off the penultimate level. + // Finish off the proximal level. ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); ASSERT_EQ("0,0,0,0,0,0,3", FilesPerLevel()); verify_db(); diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index f9c0f47ef7be..fdd042fcd717 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -258,12 +258,12 @@ class ComparatorDBTest private: std::string dbname_; Env* env_; - DB* db_; + std::unique_ptr db_; Options last_options_; std::unique_ptr comparator_guard; public: - ComparatorDBTest() : env_(Env::Default()), db_(nullptr) { + ComparatorDBTest() : env_(Env::Default()) { kTestComparator = BytewiseComparator(); dbname_ = test::PerThreadDBPath("comparator_db_test"); BlockBasedTableOptions toptions; @@ -274,12 +274,12 @@ class ComparatorDBTest } ~ComparatorDBTest() override { - delete db_; + db_.reset(); EXPECT_OK(DestroyDB(dbname_, last_options_)); kTestComparator = BytewiseComparator(); } - DB* GetDB() { return db_; } + DB* GetDB() { return db_.get(); } void SetOwnedComparator(const Comparator* cmp, bool owner = true) { if (owner) { @@ -301,14 +301,12 @@ class ComparatorDBTest } void Destroy() { - delete db_; - db_ = nullptr; + db_.reset(); ASSERT_OK(DestroyDB(dbname_, last_options_)); } Status TryReopen() { - delete db_; - db_ = nullptr; + db_.reset(); last_options_.create_if_missing = true; return DB::Open(last_options_, dbname_, &db_); @@ -318,7 +316,7 @@ class ComparatorDBTest INSTANTIATE_TEST_CASE_P(FormatDef, ComparatorDBTest, testing::Values(test::kDefaultFormatVersion)); INSTANTIATE_TEST_CASE_P(FormatLatest, ComparatorDBTest, - testing::Values(kLatestFormatVersion)); + testing::Values(kLatestBbtFormatVersion)); TEST_P(ComparatorDBTest, Bytewise) { for (int rand_seed = 301; rand_seed < 306; rand_seed++) { diff --git a/db/convenience.cc b/db/convenience.cc index 47ce59f2f8d1..5560cffe5fda 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -26,6 +26,17 @@ Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, const RangePtr* ranges, size_t n, bool include_end) { + std::vector range_opts(n); + for (size_t i = 0; i < n; ++i) { + range_opts[i] = {OptSlice::CopyFromPtr(ranges[i].start), + OptSlice::CopyFromPtr(ranges[i].limit)}; + } + return DeleteFilesInRanges(db, column_family, range_opts.data(), n, + include_end); +} + +Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, + const RangeOpt* ranges, size_t n, bool include_end) { return (static_cast_with_check(db->GetRootDB())) ->DeleteFilesInRanges(column_family, ranges, n, include_end); } @@ -54,7 +65,7 @@ Status VerifySstFileChecksum(const Options& options, } Status VerifySstFileChecksumInternal(const Options& options, - const EnvOptions& env_options, + const FileOptions& file_options, const ReadOptions& read_options, const std::string& file_path, const SequenceNumber& largest_seqno) { @@ -63,8 +74,8 @@ Status VerifySstFileChecksumInternal(const Options& options, InternalKeyComparator internal_comparator(options.comparator); ImmutableOptions ioptions(options); - Status s = ioptions.fs->NewRandomAccessFile( - file_path, FileOptions(env_options), &file, nullptr); + Status s = + ioptions.fs->NewRandomAccessFile(file_path, file_options, &file, nullptr); if (s.ok()) { s = ioptions.fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr); } else { @@ -82,9 +93,10 @@ Status VerifySstFileChecksumInternal(const Options& options, nullptr /* file_read_hist */, ioptions.rate_limiter.get())); const bool kImmortal = true; auto reader_options = TableReaderOptions( - ioptions, options.prefix_extractor, env_options, internal_comparator, - options.block_protection_bytes_per_key, false /* skip_filters */, - !kImmortal, false /* force_direct_prefetch */, -1 /* level */); + ioptions, options.prefix_extractor, options.compression_manager.get(), + file_options, internal_comparator, options.block_protection_bytes_per_key, + false /* skip_filters */, !kImmortal, false /* force_direct_prefetch */, + -1 /* level */); reader_options.largest_seqno = largest_seqno; s = options.table_factory->NewTableReader( read_options, reader_options, std::move(file_reader), file_size, diff --git a/db/convenience_impl.h b/db/convenience_impl.h index 32f4476bde99..5e8d6d49667c 100644 --- a/db/convenience_impl.h +++ b/db/convenience_impl.h @@ -5,10 +5,11 @@ #pragma once #include "rocksdb/db.h" +#include "rocksdb/file_system.h" namespace ROCKSDB_NAMESPACE { Status VerifySstFileChecksumInternal(const Options& options, - const EnvOptions& env_options, + const FileOptions& file_options, const ReadOptions& read_options, const std::string& file_path, const SequenceNumber& largest_seqno = 0); diff --git a/db/corruption_test.cc b/db/corruption_test.cc index e99612c2b8a3..448d2c9d94c0 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -73,7 +73,7 @@ class CorruptionTest : public testing::Test { std::string dbname_; std::shared_ptr tiny_cache_; Options options_; - DB* db_; + std::unique_ptr db_; CorruptionTest() { // If LRU cache shard bit is smaller than 2 (or -1 which will automatically @@ -105,8 +105,7 @@ class CorruptionTest : public testing::Test { SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->LoadDependency({}); SyncPoint::GetInstance()->ClearAllCallBacks(); - delete db_; - db_ = nullptr; + db_.reset(); if (getenv("KEEP_DB")) { fprintf(stdout, "db is still at %s\n", dbname_.c_str()); } else { @@ -116,14 +115,12 @@ class CorruptionTest : public testing::Test { } } - void CloseDb() { - delete db_; - db_ = nullptr; - } + void CloseDb() { db_.reset(); } + + DBImpl* dbfull() { return static_cast_with_check(db_.get()); } Status TryReopen(Options* options = nullptr) { - delete db_; - db_ = nullptr; + db_.reset(); Options opt = (options ? *options : options_); if (opt.env == Options().env) { // If env is not overridden, replace it with ErrorEnv. @@ -141,8 +138,7 @@ class CorruptionTest : public testing::Test { void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); } void RepairDB() { - delete db_; - db_ = nullptr; + db_.reset(); ASSERT_OK(::ROCKSDB_NAMESPACE::RepairDB(dbname_, options_)); } @@ -151,8 +147,7 @@ class CorruptionTest : public testing::Test { WriteBatch batch; for (int i = 0; i < n; i++) { if (flush_every != 0 && i != 0 && i % flush_every == 0) { - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); } // if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); Slice key = Key(i + start, &key_space); @@ -436,14 +431,14 @@ TEST_F(CorruptionTest, NewFileErrorDuringWrite) { TEST_F(CorruptionTest, TableFile) { Build(100); - DBImpl* dbi = static_cast_with_check(db_); + DBImpl* dbi = dbfull(); ASSERT_OK(dbi->TEST_FlushMemTable()); ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr)); Corrupt(kTableFile, 100, 1); Check(99, 99); - ASSERT_NOK(dbi->VerifyChecksum()); + ASSERT_NOK(db_->VerifyChecksum()); } TEST_F(CorruptionTest, VerifyChecksumReadahead) { @@ -460,14 +455,14 @@ TEST_F(CorruptionTest, VerifyChecksumReadahead) { Reopen(&options); Build(10000); - DBImpl* dbi = static_cast_with_check(db_); + DBImpl* dbi = dbfull(); ASSERT_OK(dbi->TEST_FlushMemTable()); ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr)); senv.count_random_reads_ = true; senv.random_read_counter_.Reset(); - ASSERT_OK(dbi->VerifyChecksum()); + ASSERT_OK(db_->VerifyChecksum()); // Make sure the counter is enabled. ASSERT_GT(senv.random_read_counter_.Read(), 0); @@ -480,7 +475,7 @@ TEST_F(CorruptionTest, VerifyChecksumReadahead) { senv.random_read_bytes_counter_ = 0; ReadOptions ro; ro.readahead_size = size_t{32 * 1024}; - ASSERT_OK(dbi->VerifyChecksum(ro)); + ASSERT_OK(db_->VerifyChecksum(ro)); // The SST file is about 10MB. We set readahead size to 32KB. // Give 0 to 20 reads for metadata blocks, and allow real read // to range from 24KB to 48KB. The lower bound would be: @@ -494,8 +489,7 @@ TEST_F(CorruptionTest, VerifyChecksumReadahead) { // disabled). options.allow_mmap_reads = true; Reopen(&options); - dbi = static_cast(db_); - ASSERT_OK(dbi->VerifyChecksum(ro)); + ASSERT_OK(db_->VerifyChecksum(ro)); CloseDb(); } @@ -508,18 +502,16 @@ TEST_F(CorruptionTest, TableFileIndexData) { Reopen(&options); // build 2 tables, flush at 5000 Build(10000, 5000); - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // corrupt an index block of an entire file Corrupt(kTableFile, -2000, 500); options.paranoid_checks = false; Reopen(&options); - dbi = static_cast_with_check(db_); // one full file may be readable, since only one was corrupted // the other file should be fully non-readable, since index was corrupted Check(0, 5000, ReadOptions(true, true)); - ASSERT_NOK(dbi->VerifyChecksum()); + ASSERT_NOK(db_->VerifyChecksum()); // In paranoid mode, the db cannot be opened due to the corrupted file. ASSERT_TRUE(TryReopen().IsCorruption()); @@ -527,8 +519,7 @@ TEST_F(CorruptionTest, TableFileIndexData) { TEST_F(CorruptionTest, TableFileFooterMagic) { Build(100); - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); Check(100, 100); // Corrupt the whole footer Corrupt(kTableFile, -100, 100); @@ -543,8 +534,7 @@ TEST_F(CorruptionTest, TableFileFooterMagic) { TEST_F(CorruptionTest, TableFileFooterNotMagic) { Build(100); - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); Check(100, 100); // Corrupt footer except magic number Corrupt(kTableFile, -100, 92); @@ -556,10 +546,77 @@ TEST_F(CorruptionTest, TableFileFooterNotMagic) { ASSERT_TRUE(s.ToString().find(".sst") != std::string::npos); } +TEST_F(CorruptionTest, DBOpenWithWrongFileSize) { + // Validate that when paranoid flag is true, DB::Open() fails if one of the + // file corrupted. Validate that when paranoid flag is false, DB::Open() + // succeed if one of the file corrupted, and the healthy file is readable. + CloseDb(); + + const std::string test_cf_name = "test_cf"; + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); + cf_descs.emplace_back(test_cf_name, ColumnFamilyOptions()); + + { + options_.create_missing_column_families = true; + std::vector cfhs; + ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + + ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k", "v")); + ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k1", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k2", "v2")); + for (auto* cfh : cfhs) { + delete cfh; + } + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + + // ******************************************** + // Corrupt the file by making the file bigger + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + std::string filename = dbname_ + metadata[0].name; + const auto& fs = options_.env->GetFileSystem(); + { + std::unique_ptr f; + ASSERT_OK(fs->ReopenWritableFile(filename, FileOptions(), &f, nullptr)); + ASSERT_OK(f->Append("blahblah", IOOptions(), nullptr)); + ASSERT_OK(f->Close(IOOptions(), nullptr)); + } + CloseDb(); + } + + // DB failed to open due to one of the file is corrupted, as paranoid flag is + // true + options_.paranoid_checks = true; + std::vector cfhs; + Status s; + s = DB::Open(options_, dbname_, cf_descs, &cfhs, &db_); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("file size mismatch") != std::string::npos); + + // DB opened successfully, as paranoid flag is false, validate the one that is + // healthy is still accessible + options_.paranoid_checks = false; + ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), cfhs[1], "k1", &v)); + ASSERT_EQ(v, "v1"); + + // Validate the default column family is corrupted + Check(0, 0); + s = db_->Get(ReadOptions(), cfhs[0], "k1", &v); + ASSERT_TRUE(s.IsCorruption()); + + delete cfhs[1]; + delete cfhs[0]; +} + TEST_F(CorruptionTest, TableFileWrongSize) { Build(100); - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); Check(100, 100); // ******************************************** @@ -579,13 +636,16 @@ TEST_F(CorruptionTest, TableFileWrongSize) { // DB actually accepts this without paranoid checks, relying on size // recorded in manifest to locate the SST footer. options_.paranoid_checks = false; - options_.skip_checking_sst_file_sizes_on_db_open = false; Reopen(); - Check(100, 100); + // As footer could not be extraced, file is completely unreadable + Check(0, 0); + std::string v; + auto s = db_->Get(ReadOptions(), "k1", &v); + ASSERT_TRUE(s.IsCorruption()); // But reports the issue with paranoid checks options_.paranoid_checks = true; - Status s = TryReopen(); + s = TryReopen(); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("file size mismatch") != std::string::npos); @@ -639,12 +699,11 @@ TEST_F(CorruptionTest, SequenceNumberRecovery) { TEST_F(CorruptionTest, CorruptedDescriptor) { ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK( - dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr)); + db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr)); Corrupt(kDescriptorFile, 0, 1000); Status s = TryReopen(); @@ -663,7 +722,7 @@ TEST_F(CorruptionTest, CompactionInputError) { options.env = env_.get(); Reopen(&options); Build(10); - DBImpl* dbi = static_cast_with_check(db_); + DBImpl* dbi = dbfull(); ASSERT_OK(dbi->TEST_FlushMemTable()); ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr)); @@ -671,12 +730,12 @@ TEST_F(CorruptionTest, CompactionInputError) { Corrupt(kTableFile, 100, 1); Check(9, 9); - ASSERT_NOK(dbi->VerifyChecksum()); + ASSERT_NOK(db_->VerifyChecksum()); // Force compactions by writing lots of values Build(10000); Check(10000, 10000); - ASSERT_NOK(dbi->VerifyChecksum()); + ASSERT_NOK(db_->VerifyChecksum()); } TEST_F(CorruptionTest, CompactionInputErrorParanoid) { @@ -687,14 +746,14 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) { options.write_buffer_size = 131072; options.max_write_buffer_number = 2; Reopen(&options); - DBImpl* dbi = static_cast_with_check(db_); + DBImpl* dbi = dbfull(); // Fill levels >= 1 - for (int level = 1; level < dbi->NumberLevels(); level++) { - ASSERT_OK(dbi->Put(WriteOptions(), "", "begin")); - ASSERT_OK(dbi->Put(WriteOptions(), "~", "end")); + for (int level = 1; level < db_->NumberLevels(); level++) { + ASSERT_OK(db_->Put(WriteOptions(), "", "begin")); + ASSERT_OK(db_->Put(WriteOptions(), "~", "end")); ASSERT_OK(dbi->TEST_FlushMemTable()); - for (int comp_level = 0; comp_level < dbi->NumberLevels() - level; + for (int comp_level = 0; comp_level < db_->NumberLevels() - level; ++comp_level) { ASSERT_OK(dbi->TEST_CompactRange(comp_level, nullptr, nullptr)); } @@ -702,7 +761,7 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) { Reopen(&options); - dbi = static_cast_with_check(db_); + dbi = dbfull(); Build(10); ASSERT_OK(dbi->TEST_FlushMemTable()); ASSERT_OK(dbi->TEST_WaitForCompact()); @@ -710,7 +769,7 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) { CorruptTableFileAtLevel(0, 100, 1); Check(9, 9); - ASSERT_NOK(dbi->VerifyChecksum()); + ASSERT_NOK(db_->VerifyChecksum()); // Write must eventually fail because of corrupted table Status s; @@ -729,17 +788,16 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) { TEST_F(CorruptionTest, UnrelatedKeys) { Build(10); - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); Corrupt(kTableFile, 100, 1); - ASSERT_NOK(dbi->VerifyChecksum()); + ASSERT_NOK(db_->VerifyChecksum()); std::string tmp1, tmp2; ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); std::string v; ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); ASSERT_EQ(Value(1000, &tmp2).ToString(), v); - ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); ASSERT_EQ(Value(1000, &tmp2).ToString(), v); } @@ -786,14 +844,12 @@ TEST_F(CorruptionTest, FileSystemStateCorrupted) { Reopen(&options); Build(10); ASSERT_OK(db_->Flush(FlushOptions())); - DBImpl* dbi = static_cast_with_check(db_); std::vector metadata; - dbi->GetLiveFilesMetaData(&metadata); + db_->GetLiveFilesMetaData(&metadata); ASSERT_GT(metadata.size(), 0); std::string filename = dbname_ + metadata[0].name; - delete db_; - db_ = nullptr; + db_.reset(); if (iter == 0) { // corrupt file size std::unique_ptr file; @@ -825,8 +881,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) { options.create_if_missing = true; Status s; for (const auto& mode : corruption_modes) { - delete db_; - db_ = nullptr; + db_.reset(); s = DestroyDB(dbname_, options); ASSERT_OK(s); std::shared_ptr mock = @@ -853,8 +908,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) { options.create_if_missing = true; Status s; for (const auto& mode : corruption_modes) { - delete db_; - db_ = nullptr; + db_.reset(); s = DestroyDB(dbname_, options); ASSERT_OK(s); std::shared_ptr mock = @@ -863,13 +917,11 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) { ASSERT_OK(DB::Open(options, dbname_, &db_)); assert(db_ != nullptr); // suppress false clang-analyze report Build(100, 2); - // ASSERT_OK(db_->Flush(FlushOptions())); - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); mock->SetCorruptionMode(mode); CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; - s = dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr); + s = db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr); if (mode == mock::MockTableFactory::kCorruptNone) { ASSERT_OK(s); } else { @@ -885,8 +937,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) { options.paranoid_file_checks = true; options.create_if_missing = true; for (bool do_flush : {true, false}) { - delete db_; - db_ = nullptr; + db_.reset(); ASSERT_OK(DestroyDB(dbname_, options)); ASSERT_OK(DB::Open(options, dbname_, &db_)); std::string start, end; @@ -903,12 +954,11 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) { if (do_flush) { ASSERT_OK(db_->Flush(FlushOptions())); } else { - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK( - dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr)); + db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr)); } db_->ReleaseSnapshot(snap); } @@ -921,8 +971,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) { options.paranoid_file_checks = true; options.create_if_missing = true; for (bool do_flush : {true, false}) { - delete db_; - db_ = nullptr; + db_.reset(); ASSERT_OK(DestroyDB(dbname_, options)); ASSERT_OK(DB::Open(options, dbname_, &db_)); assert(db_ != nullptr); // suppress false clang-analyze report @@ -942,12 +991,11 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) { if (do_flush) { ASSERT_OK(db_->Flush(FlushOptions())); } else { - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK( - dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr)); + db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr)); } db_->ReleaseSnapshot(snap); } @@ -960,8 +1008,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) { options.paranoid_file_checks = true; options.create_if_missing = true; for (bool do_flush : {true, false}) { - delete db_; - db_ = nullptr; + db_.reset(); ASSERT_OK(DestroyDB(dbname_, options)); ASSERT_OK(DB::Open(options, dbname_, &db_)); assert(db_ != nullptr); // suppress false clang-analyze report @@ -978,12 +1025,11 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) { if (do_flush) { ASSERT_OK(db_->Flush(FlushOptions())); } else { - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK( - dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr)); + db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr)); } db_->ReleaseSnapshot(snap); } @@ -996,8 +1042,7 @@ TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) { options.create_if_missing = true; options.allow_data_in_errors = true; auto mode = mock::MockTableFactory::kCorruptKey; - delete db_; - db_ = nullptr; + db_.reset(); ASSERT_OK(DestroyDB(dbname_, options)); std::shared_ptr mock = @@ -1009,12 +1054,11 @@ TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) { assert(db_ != nullptr); // suppress false clang-analyze report Build(100, 2); - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; Status s = - dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr); + db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr); ASSERT_NOK(s); ASSERT_TRUE(s.IsCorruption()); } @@ -1025,8 +1069,7 @@ TEST_F(CorruptionTest, CompactionKeyOrderCheck) { options.env = env_.get(); options.paranoid_file_checks = false; options.create_if_missing = true; - delete db_; - db_ = nullptr; + db_.reset(); ASSERT_OK(DestroyDB(dbname_, options)); std::shared_ptr mock = std::make_shared(); @@ -1035,14 +1078,13 @@ TEST_F(CorruptionTest, CompactionKeyOrderCheck) { assert(db_ != nullptr); // suppress false clang-analyze report mock->SetCorruptionMode(mock::MockTableFactory::kCorruptReorderKey); Build(100, 2); - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); mock->SetCorruptionMode(mock::MockTableFactory::kCorruptNone); CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_NOK( - dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr)); + db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr)); } TEST_F(CorruptionTest, FlushKeyOrderCheck) { @@ -1069,7 +1111,7 @@ TEST_F(CorruptionTest, FlushKeyOrderCheck) { } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Status s = static_cast_with_check(db_)->TEST_FlushMemTable(); + Status s = dbfull()->TEST_FlushMemTable(); ASSERT_NOK(s); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -1193,7 +1235,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) { // while other don't. { ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_)); - auto* dbimpl = static_cast_with_check(db_); + auto* dbimpl = dbfull(); assert(dbimpl); // Write one key to test_cf. diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 78ae86683318..1ece0e3630ab 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -21,18 +21,18 @@ class CuckooTableDBTest : public testing::Test { private: std::string dbname_; Env* env_; - DB* db_; + std::unique_ptr db_; public: CuckooTableDBTest() : env_(Env::Default()) { dbname_ = test::PerThreadDBPath("cuckoo_table_db_test"); EXPECT_OK(DestroyDB(dbname_, Options())); - db_ = nullptr; + db_.reset(); Reopen(); } ~CuckooTableDBTest() override { - delete db_; + db_.reset(); EXPECT_OK(DestroyDB(dbname_, Options())); } @@ -47,12 +47,11 @@ class CuckooTableDBTest : public testing::Test { return options; } - DBImpl* dbfull() { return static_cast_with_check(db_); } + DBImpl* dbfull() { return static_cast_with_check(db_.get()); } // The following util methods are copied from plain_table_db_test. void Reopen(Options* options = nullptr) { - delete db_; - db_ = nullptr; + db_.reset(); Options opts; if (options != nullptr) { opts = *options; @@ -66,8 +65,7 @@ class CuckooTableDBTest : public testing::Test { void DestroyAndReopen(Options* options) { assert(options); ASSERT_OK(db_->Close()); - delete db_; - db_ = nullptr; + db_.reset(); ASSERT_OK(DestroyDB(dbname_, *options)); Reopen(options); } @@ -130,7 +128,7 @@ TEST_F(CuckooTableDBTest, Flush) { ASSERT_OK(dbfull()->TEST_FlushMemTable()); TablePropertiesCollection ptc; - ASSERT_OK(static_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&ptc)); VerifySstUniqueIds(ptc); ASSERT_EQ(1U, ptc.size()); ASSERT_EQ(3U, ptc.begin()->second->num_entries); @@ -147,7 +145,7 @@ TEST_F(CuckooTableDBTest, Flush) { ASSERT_OK(Put("key6", "v6")); ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_OK(static_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&ptc)); VerifySstUniqueIds(ptc); ASSERT_EQ(2U, ptc.size()); auto row = ptc.begin(); @@ -165,7 +163,7 @@ TEST_F(CuckooTableDBTest, Flush) { ASSERT_OK(Delete("key5")); ASSERT_OK(Delete("key4")); ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_OK(static_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&ptc)); VerifySstUniqueIds(ptc); ASSERT_EQ(3U, ptc.size()); row = ptc.begin(); @@ -190,7 +188,7 @@ TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) { ASSERT_OK(dbfull()->TEST_FlushMemTable()); TablePropertiesCollection ptc; - ASSERT_OK(static_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&ptc)); VerifySstUniqueIds(ptc); ASSERT_EQ(1U, ptc.size()); ASSERT_EQ(2U, ptc.begin()->second->num_entries); diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index edb10693affd..71bf37f197fe 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -91,17 +91,15 @@ class DBBasicTest : public DBTestBase { TEST_F(DBBasicTest, OpenWhenOpen) { Options options = CurrentOptions(); options.env = env_; - DB* db2 = nullptr; + std::unique_ptr db2; Status s = DB::Open(options, dbname_, &db2); - ASSERT_NOK(s) << [db2]() { - delete db2; + ASSERT_NOK(s) << [&db2]() { + db2.reset(); return "db2 open: ok"; }(); ASSERT_EQ(Status::Code::kIOError, s.code()); ASSERT_EQ(Status::SubCode::kNone, s.subcode()); ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr); - - delete db2; } TEST_F(DBBasicTest, EnableDirectIOWithZeroBuf) { @@ -161,6 +159,7 @@ TEST_F(DBBasicTest, UniqueSession) { ASSERT_EQ(sid2, sid3); + DestroyAndReopen(options); CreateAndReopenWithCF({"goku"}, options); ASSERT_OK(db_->GetDbSessionId(sid1)); ASSERT_OK(Put("bar", "e1")); @@ -179,6 +178,7 @@ TEST_F(DBBasicTest, UniqueSession) { TEST_F(DBBasicTest, ReadOnlyDB) { ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Flush()); ASSERT_OK(Put("foo", "v3")); Close(); @@ -208,10 +208,11 @@ TEST_F(DBBasicTest, ReadOnlyDB) { auto options = CurrentOptions(); assert(options.env == env_); - ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_OK(EnforcedReadOnlyReopen(options)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); verify_all_iters(); + ASSERT_EQ(Flush().code(), Status::Code::kNotSupported); Close(); // Reopen and flush memtable. @@ -219,26 +220,75 @@ TEST_F(DBBasicTest, ReadOnlyDB) { ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. - ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_OK(EnforcedReadOnlyReopen(options)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); verify_all_iters(); - ASSERT_TRUE(db_->SyncWAL().IsNotSupported()); + ASSERT_EQ(db_->SyncWAL().code(), Status::Code::kNotSupported); + + // More ops that should fail + std::vector cfhs{{}}; + ASSERT_EQ(db_->CreateColumnFamily(options, "blah", &cfhs[0]).code(), + Status::Code::kNotSupported); + + ASSERT_EQ(db_->CreateColumnFamilies(options, {"blah"}, &cfhs).code(), + Status::Code::kNotSupported); + + std::vector cfds; + cfds.push_back({"blah", options}); + ASSERT_EQ(db_->CreateColumnFamilies(cfds, &cfhs).code(), + Status::Code::kNotSupported); } -// TODO akanksha: Update the test to check that combination -// does not actually write to FS (use open read-only with -// CompositeEnvWrapper+ReadOnlyFileSystem). -TEST_F(DBBasicTest, DISABLED_ReadOnlyDBWithWriteDBIdToManifestSet) { +TEST_F(DBBasicTest, ReadOnlyDBFlushWAL) { + // Test that FlushWAL returns NotSupported on read-only DB, and that + // GetLiveFilesStorageInfo works correctly even with manual_wal_flush=true. + // This is a regression test for a bug where GetLiveFilesStorageInfo would + // crash on read-only DBs with manual_wal_flush=true because FlushWAL + // accessed logs_.back() on an empty deque. + auto options = CurrentOptions(); + options.manual_wal_flush = true; + DestroyAndReopen(options); ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("bar", "v2")); - ASSERT_OK(Put("foo", "v3")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("baz", "v3")); // Unflushed data in WAL + Close(); + + // Reopen as read-only + ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + ASSERT_EQ("v3", Get("baz")); + + // FlushWAL should return NotSupported (not crash) + ASSERT_EQ(db_->FlushWAL(/*sync=*/false).code(), Status::Code::kNotSupported); + ASSERT_EQ(db_->FlushWAL(/*sync=*/true).code(), Status::Code::kNotSupported); + + // GetLiveFilesStorageInfo should succeed (previously crashed with + // manual_wal_flush=true because it called FlushWAL which accessed + // logs_.back() on empty deque) + LiveFilesStorageInfoOptions lfsi_opts; + lfsi_opts.wal_size_for_flush = 0; + std::vector files; + ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files)); + ASSERT_GT(files.size(), 0); + Close(); +} +TEST_F(DBBasicTest, ReadOnlyDBWithWriteDBIdToManifestSet) { auto options = CurrentOptions(); + options.write_dbid_to_manifest = false; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + Close(); + options.write_dbid_to_manifest = true; assert(options.env == env_); - ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_OK(EnforcedReadOnlyReopen(options)); std::string db_id1; ASSERT_OK(db_->GetDbIdentity(db_id1)); ASSERT_EQ("v3", Get("foo")); @@ -258,7 +308,7 @@ TEST_F(DBBasicTest, DISABLED_ReadOnlyDBWithWriteDBIdToManifestSet) { ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. - ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_OK(EnforcedReadOnlyReopen(options)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); ASSERT_TRUE(db_->SyncWAL().IsNotSupported()); @@ -534,14 +584,14 @@ TEST_F(DBBasicTest, GetSnapshot) { TEST_F(DBBasicTest, CheckLock) { do { - DB* localdb = nullptr; + std::unique_ptr localdb; Options options = CurrentOptions(); ASSERT_OK(TryReopen(options)); // second open should fail Status s = DB::Open(options, dbname_, &localdb); - ASSERT_NOK(s) << [localdb]() { - delete localdb; + ASSERT_NOK(s) << [&localdb]() { + localdb.reset(); return "localdb open: ok"; }(); #ifdef OS_LINUX @@ -660,30 +710,6 @@ TEST_F(DBBasicTest, Flush) { } while (ChangeCompactOptions()); } -TEST_F(DBBasicTest, ManifestRollOver) { - do { - Options options; - options.max_manifest_file_size = 10; // 10 bytes - options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, options); - { - ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1'))); - ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2'))); - ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3'))); - uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo(); - ASSERT_OK(Flush(1)); // This should trigger LogAndApply. - uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo(); - ASSERT_GT(manifest_after_flush, manifest_before_flush); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush); - // check if a new manifest file got inserted or not. - ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1")); - ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2")); - ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3")); - } - } while (ChangeCompactOptions()); -} - TEST_F(DBBasicTest, IdentityAcrossRestarts) { constexpr size_t kMinIdSize = 10; do { @@ -834,7 +860,7 @@ TEST_F(DBBasicTest, Snapshot) { ASSERT_OK(Put(1, "foo", "1v3")); { - ManagedSnapshot s3(db_); + ManagedSnapshot s3(db_.get()); ASSERT_EQ(3U, GetNumSnapshots()); ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); @@ -957,7 +983,7 @@ TEST_F(DBBasicTest, DBOpen_Options) { Destroy(options); // Does not exist, and create_if_missing == false: error - DB* db = nullptr; + std::unique_ptr db; options.create_if_missing = false; Status s = DB::Open(options, dbname_, &db); ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); @@ -969,8 +995,7 @@ TEST_F(DBBasicTest, DBOpen_Options) { ASSERT_OK(s); ASSERT_TRUE(db != nullptr); - delete db; - db = nullptr; + db.reset(); // Does exist, and error_if_exists == true: error options.create_if_missing = false; @@ -986,8 +1011,7 @@ TEST_F(DBBasicTest, DBOpen_Options) { ASSERT_OK(s); ASSERT_TRUE(db != nullptr); - delete db; - db = nullptr; + db.reset(); } TEST_F(DBBasicTest, CompactOnFlush) { @@ -1292,7 +1316,7 @@ TEST_F(DBBasicTest, DBClose) { std::string dbname = test::PerThreadDBPath("db_close_test"); ASSERT_OK(DestroyDB(dbname, options)); - DB* db = nullptr; + std::unique_ptr db; TestEnv* env = new TestEnv(env_); std::unique_ptr local_env_guard(env); options.create_if_missing = true; @@ -1305,14 +1329,14 @@ TEST_F(DBBasicTest, DBClose) { ASSERT_EQ(env->GetCloseCount(), 1); ASSERT_EQ(s, Status::IOError()); - delete db; + db.reset(); ASSERT_EQ(env->GetCloseCount(), 1); // Do not call DB::Close() and ensure our logger Close() still gets called s = DB::Open(options, dbname, &db); ASSERT_OK(s); ASSERT_TRUE(db != nullptr); - delete db; + db.reset(); ASSERT_EQ(env->GetCloseCount(), 2); // close by WaitForCompact() with close_db option @@ -1327,7 +1351,7 @@ TEST_F(DBBasicTest, DBClose) { // see TestLogger::CloseHelper() ASSERT_EQ(s, Status::IOError()); - delete db; + db.reset(); ASSERT_EQ(env->GetCloseCount(), 3); // Provide our own logger and ensure DB::Close() does not close it @@ -1338,7 +1362,7 @@ TEST_F(DBBasicTest, DBClose) { s = db->Close(); ASSERT_EQ(s, Status::OK()); - delete db; + db.reset(); ASSERT_EQ(env->GetCloseCount(), 3); options.info_log.reset(); ASSERT_EQ(env->GetCloseCount(), 4); @@ -1356,7 +1380,7 @@ TEST_F(DBBasicTest, DBCloseAllDirectoryFDs) { ASSERT_OK(DestroyDB(dbname, options)); - DB* db = nullptr; + std::unique_ptr db; std::unique_ptr env = NewCompositeEnv( std::make_shared(FileSystem::Default())); options.create_if_missing = true; @@ -1374,7 +1398,7 @@ TEST_F(DBBasicTest, DBCloseAllDirectoryFDs) { ASSERT_EQ(counted_fs->counters()->dir_opens, counted_fs->counters()->dir_closes); ASSERT_OK(s); - delete db; + db.reset(); } TEST_F(DBBasicTest, DBCloseFlushError) { @@ -1436,7 +1460,7 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) { } int get_sv_count = 0; - ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check(db_); + ROCKSDB_NAMESPACE::DBImpl* db = dbfull(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::MultiCFSnapshot::AfterRefSV", [&](void* /*arg*/) { if (++get_sv_count == 2) { @@ -1508,10 +1532,9 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) { ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2"); for (int cf = 0; cf < 8; ++cf) { - auto* cfd = - static_cast_with_check( - static_cast_with_check(db_)->GetColumnFamilyHandle(cf)) - ->cfd(); + auto* cfd = static_cast_with_check( + dbfull()->GetColumnFamilyHandle(cf)) + ->cfd(); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete); } @@ -1597,10 +1620,9 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) { "cf" + std::to_string(j) + "_val" + std::to_string(retries)); } for (int i = 0; i < 8; ++i) { - auto* cfd = - static_cast_with_check( - static_cast_with_check(db_)->GetColumnFamilyHandle(i)) - ->cfd(); + auto* cfd = static_cast_with_check( + dbfull()->GetColumnFamilyHandle(i)) + ->cfd(); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -1624,7 +1646,7 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) { } int get_sv_count = 0; - ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check(db_); + ROCKSDB_NAMESPACE::DBImpl* db = dbfull(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::MultiCFSnapshot::AfterRefSV", [&](void* /*arg*/) { if (++get_sv_count == 2) { @@ -1665,10 +1687,9 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) { ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val"); } for (int i = 0; i < 8; ++i) { - auto* cfd = - static_cast_with_check( - static_cast_with_check(db_)->GetColumnFamilyHandle(i)) - ->cfd(); + auto* cfd = static_cast_with_check( + dbfull()->GetColumnFamilyHandle(i)) + ->cfd(); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); } } @@ -3273,9 +3294,8 @@ TEST_F(DBBasicTest, GetAllKeyVersions) { ASSERT_OK(Delete(std::to_string(i))); } std::vector key_versions; - ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(), - std::numeric_limits::max(), - &key_versions)); + ASSERT_OK(GetAllKeyVersions( + db_.get(), {}, {}, std::numeric_limits::max(), &key_versions)); ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); for (size_t i = 0; i < kNumInserts + kNumDeletes + kNumUpdates; i++) { if (i % 3 == 0) { @@ -3284,7 +3304,7 @@ TEST_F(DBBasicTest, GetAllKeyVersions) { ASSERT_EQ(key_versions[i].GetTypeName(), "TypeValue"); } } - ASSERT_OK(GetAllKeyVersions(db_, handles_[0], Slice(), Slice(), + ASSERT_OK(GetAllKeyVersions(db_.get(), handles_[0], {}, {}, std::numeric_limits::max(), &key_versions)); ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); @@ -3299,10 +3319,17 @@ TEST_F(DBBasicTest, GetAllKeyVersions) { for (size_t i = 0; i + 1 != kNumDeletes; ++i) { ASSERT_OK(Delete(1, std::to_string(i))); } - ASSERT_OK(GetAllKeyVersions(db_, handles_[1], Slice(), Slice(), + ASSERT_OK(GetAllKeyVersions(db_.get(), handles_[1], {}, {}, std::numeric_limits::max(), &key_versions)); ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size()); + + // Change from historical behavior: empty key is now interpreted literally as + // a legal key (rather than as a "not present" key) + ASSERT_OK(GetAllKeyVersions(db_.get(), handles_[1], Slice(), Slice(), + std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(key_versions.size(), 0); } TEST_F(DBBasicTest, ValueTypeString) { @@ -3354,6 +3381,69 @@ TEST_F(DBBasicTest, MultiGetIOBufferOverrun) { keys.data(), values.data(), statuses.data(), true); } +TEST_F(DBBasicTest, MultiGetWithSnapshotsAndPersistedTier) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + // Insert initial data + ASSERT_OK(Put(0, "key1", "value1_cf0")); + ASSERT_OK(Put(1, "key1", "value1_cf1")); + ASSERT_OK(Put(2, "key1", "value1_cf2")); + ASSERT_OK(Flush({0, 1, 2})); + for (auto cf : {0, 1, 2}) { + ASSERT_EQ(1, NumTableFilesAtLevel(0, cf)); + } + + ASSERT_OK(Put(0, "key1", "value2_cf0")); + ASSERT_OK(Put(1, "key1", "value2_cf1")); + ASSERT_OK(Put(2, "key1", "value2_cf2")); + + // Prepare for concurrent atomic flush + std::atomic flush_done(false); + std::thread flush_thread([&]() { + ASSERT_OK(Flush({0, 1, 2})); + flush_done.store(true); + }); + + // Perform MultiGet with snapshot and read_tier = kPersistentTier + ReadOptions ro; + const Snapshot* snapshot = db_->GetSnapshot(); + ro.snapshot = snapshot; + ro.read_tier = kPersistedTier; + + std::string k = "key1"; + std::vector keys(3, Slice(k)); + std::vector statuses(keys.size()); + std::vector cfs(keys.size()); + std::vector new_keys(keys.size()); + std::vector pin_values(keys.size()); + for (size_t i = 0; i < keys.size(); ++i) { + cfs[i] = handles_[i]; + } + db_->MultiGet(ro, cfs.size(), cfs.data(), keys.data(), pin_values.data(), + statuses.data()); + for (const auto& s : statuses) { + ASSERT_OK(s); + } + + if (pin_values[0] == "value1_cf0") { + // Check if the first value matches expected value + ASSERT_EQ(pin_values[1], "value1_cf1"); + ASSERT_EQ(pin_values[2], "value1_cf2"); + } else { + // If first value doesn't match, check if we got the updated values + ASSERT_EQ(pin_values[0], "value2_cf0"); + ASSERT_EQ(pin_values[1], "value2_cf1"); + ASSERT_EQ(pin_values[2], "value2_cf2"); + } + + flush_thread.join(); + db_->ReleaseSnapshot(snapshot); +} + TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) { Options options = CurrentOptions(); DestroyAndReopen(options); @@ -3808,6 +3898,75 @@ TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) { ASSERT_OK(iter->status()); } +TEST_F(DBBasicTest, BestEffortRecoveryFailureWithTableCacheUseAfterFree) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.env = env_; + // Force multiple manifest files + options.max_manifest_file_size = 1; + options.max_manifest_space_amp_pct = 0; + + DestroyAndReopen(options); + + // Disable file deletions to preserve old manifest files for + // best-efforts recovery to succeed + ASSERT_OK(db_->DisableFileDeletions()); + + // Create multiple SST files to populate TableCache during + // best-efforts recovery + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("key" + std::to_string(i), + std::string(1000, static_cast('a' + i)))); + ASSERT_OK(Flush()); + } + + // Verify we have multiple manifest files + std::vector files; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + int manifest_count = 0; + for (const auto& file : files) { + if (file.find("MANIFEST") != std::string::npos) { + manifest_count++; + } + } + ASSERT_GE(manifest_count, 2); + + // Inject corruption after TableCache is populated (count > 3), but only once + // (injected flag) to allow best-effort recovery to trigger retry and succeed. + // This coerce the bug: first recovery caches SSTs with reference to column + // family's options in table cache and retry deletes column family so the + // reference becomes dangling. + int count = 0; + bool injected = false; + SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { + count++; + if (count > 3 && !injected) { + ASSERT_NE(nullptr, arg); + *(static_cast(arg)) = + Status::Corruption("Injected corruption"); + injected = true; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + options.best_efforts_recovery = true; + + Status s = TryReopen(options); + ASSERT_OK(s); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + for (int i = 0; i < 10; i++) { + std::string value; + // Without the fix, ASAN detects use-after-free when accessing cached SST + // files that hold dangling references to deleted ioptions. + s = db_->Get(ReadOptions(), "key" + std::to_string(i), &value); + ASSERT_TRUE(s.ok() || s.IsNotFound()); + } +} + TEST_F(DBBasicTest, DisableTrackWal) { // If WAL tracking was enabled, and then disabled during reopen, // the previously tracked WALs should be removed from MANIFEST. @@ -4994,6 +5153,104 @@ TEST_F(DBBasicTest, VerifyFileChecksumsReadahead) { (sst_size + alignment - 1) / (alignment)); } +TEST_F(DBBasicTest, DisallowMemtableWrite) { + // This test is mostly about what you can't do with memtable writes + // disallowed. For what you can do, see + // ExternalSSTFileBasicTest.FailIfNotBottommostLevelAndDisallowMemtable + Options options_allow = GetDefaultOptions(); + options_allow.create_if_missing = true; + Options options_disallow = options_allow; + options_disallow.disallow_memtable_writes = true; + options_disallow.paranoid_memory_checks = true; + options_disallow.memtable_veirfy_per_key_checksum_on_seek = true; + + DestroyAndReopen(options_allow); + // CFs allowing and disallowing memtable write + CreateColumnFamilies({"cf1", "cf2"}, options_allow); + CreateColumnFamilies({"cf3"}, options_disallow); + // XXX: needed to get consistent handles_ mappings + ReopenWithColumnFamilies( + {"default", "cf1", "cf2", "cf3"}, + {options_allow, options_allow, options_allow, options_disallow}); + + EXPECT_EQ(Put(0, "a0", "1").code(), Status::Code::kOk); + EXPECT_EQ(Put(1, "a1", "1").code(), Status::Code::kOk); + EXPECT_EQ(Put(2, "a2", "1").code(), Status::Code::kOk); + EXPECT_EQ(Put(3, "a3", "1").code(), Status::Code::kInvalidArgument); + + EXPECT_EQ(Get(0, "a0"), "1"); + EXPECT_EQ(Get(1, "a1"), "1"); + EXPECT_EQ(Get(2, "a2"), "1"); + EXPECT_EQ(Get(3, "a3"), "NOT_FOUND"); + + EXPECT_EQ(Delete(0, "z0").code(), Status::Code::kOk); + EXPECT_EQ(Delete(1, "z1").code(), Status::Code::kOk); + EXPECT_EQ(Delete(2, "z2").code(), Status::Code::kOk); + EXPECT_EQ(Delete(3, "z3").code(), Status::Code::kInvalidArgument); + + WriteBatch wb; + EXPECT_EQ(wb.Put(handles_[0], "b0", "2").code(), Status::Code::kOk); + EXPECT_EQ(wb.Put(handles_[1], "b1", "2").code(), Status::Code::kOk); + EXPECT_EQ(wb.Put(handles_[2], "b2", "2").code(), Status::Code::kOk); + EXPECT_EQ(wb.Put(handles_[3], "b3", "2").code(), + Status::Code::kInvalidArgument); + ASSERT_OK(db_->Write({}, &wb)); + wb.Clear(); + + EXPECT_EQ(Get(0, "b0"), "2"); + EXPECT_EQ(Get(1, "b1"), "2"); + EXPECT_EQ(Get(2, "b2"), "2"); + EXPECT_EQ(Get(3, "b3"), "NOT_FOUND"); + + std::unique_ptr iter( + dbfull()->NewIterator(ReadOptions(), handles_[3])); + iter->Seek("a3"); + ASSERT_OK(iter->status()); + iter.reset(); + // When the DB is re-opened with WAL entries for a CF that is newly setting + // disallow_memtable_writes, we detect that and fail the open gracefully. + ASSERT_EQ(TryReopenWithColumnFamilies( + {"default", "cf1", "cf2", "cf3"}, + {options_allow, options_allow, options_disallow, options_allow}) + .code(), + Status::Code::kInvalidArgument); + + // Successfully opening with allow creates L0 files from the WAL + ReopenWithColumnFamilies({"default", "cf1", "cf2", "cf3"}, options_allow); + + EXPECT_EQ(Get(0, "a0"), "1"); + EXPECT_EQ(Get(1, "a1"), "1"); + EXPECT_EQ(Get(2, "a2"), "1"); + EXPECT_EQ(Get(3, "a3"), "NOT_FOUND"); + + // Now able to disallow on CF2 because no relevant WAL entries + ReopenWithColumnFamilies( + {"default", "cf1", "cf2", "cf3"}, + {options_allow, options_allow, options_disallow, options_allow}); + + EXPECT_EQ(Get(0, "a0"), "1"); + EXPECT_EQ(Get(1, "a1"), "1"); + EXPECT_EQ(Get(2, "a2"), "1"); + EXPECT_EQ(Get(3, "a3"), "NOT_FOUND"); + + // Now able to write to CF 3 but not CF 2 + EXPECT_EQ(Put(0, "c0", "3").code(), Status::Code::kOk); + EXPECT_EQ(Put(1, "c1", "3").code(), Status::Code::kOk); + EXPECT_EQ(Put(2, "c2", "3").code(), Status::Code::kInvalidArgument); + EXPECT_EQ(Put(3, "c3", "3").code(), Status::Code::kOk); + + EXPECT_EQ(Get(0, "c0"), "3"); + EXPECT_EQ(Get(1, "c1"), "3"); + EXPECT_EQ(Get(2, "c2"), "NOT_FOUND"); + EXPECT_EQ(Get(3, "c3"), "3"); + + // disallow_memtable_writes not supported on default column family. + // (Would be complicated to make a WriteBatch aware of the setting in order + // to reject the write before entering the write path.) + Destroy(options_allow); + EXPECT_EQ(TryReopen(options_disallow).code(), Status::Code::kInvalidArgument); +} + // TODO: re-enable after we provide finer-grained control for WAL tracking to // meet the needs of different use cases, durability levels and recovery modes. TEST_F(DBBasicTest, DISABLED_ManualWalSync) { @@ -5210,6 +5467,94 @@ INSTANTIATE_TEST_CASE_P(DBBasicTestDeadline, DBBasicTestDeadline, ::testing::Values(std::make_tuple(true, false), std::make_tuple(false, true), std::make_tuple(true, true))); + +// FileSystemWrapper that captures FileOptions passed to NewRandomAccessFile +// for .sst files, so we can verify file_checksum fields are populated. +class ChecksumCapturingFS : public FileSystemWrapper { + public: + explicit ChecksumCapturingFS(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + + static const char* kClassName() { return "ChecksumCapturingFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + if (fname.find(".sst") != std::string::npos) { + std::lock_guard lock(mu_); + captured_file_checksum_ = opts.file_checksum; + captured_file_checksum_func_name_ = opts.file_checksum_func_name; + capture_count_++; + } + return target()->NewRandomAccessFile(fname, opts, result, dbg); + } + + std::string GetCapturedFileChecksum() { + std::lock_guard lock(mu_); + return captured_file_checksum_; + } + + std::string GetCapturedFileChecksumFuncName() { + std::lock_guard lock(mu_); + return captured_file_checksum_func_name_; + } + + int GetCaptureCount() { + std::lock_guard lock(mu_); + return capture_count_; + } + + void Reset() { + std::lock_guard lock(mu_); + captured_file_checksum_.clear(); + captured_file_checksum_func_name_.clear(); + capture_count_ = 0; + } + + private: + std::mutex mu_; + std::string captured_file_checksum_; + std::string captured_file_checksum_func_name_; + int capture_count_ = 0; +}; + +TEST_F(DBBasicTest, FileChecksumInFileOptions) { + // Verify that file_checksum and file_checksum_func_name from FileMetaData + // are propagated through FileOptions when opening SST files. + auto capturing_fs = + std::make_shared(env_->GetFileSystem()); + std::unique_ptr env(new CompositeEnvWrapper(env_, capturing_fs)); + + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = env.get(); + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + DestroyAndReopen(options); + + // Write data and flush to create an SST with a file checksum. + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Flush()); + + // Reset captures, then reopen to trigger TableCache SST open. + capturing_fs->Reset(); + Reopen(options); + + // Read to trigger SST open through TableCache::GetTableReader. + ASSERT_EQ("value1", Get("key1")); + + // Verify that checksum fields were populated. + ASSERT_GT(capturing_fs->GetCaptureCount(), 0); + ASSERT_FALSE(capturing_fs->GetCapturedFileChecksum().empty()); + ASSERT_NE(capturing_fs->GetCapturedFileChecksumFuncName(), + capturing_fs->GetCapturedFileChecksum()); + ASSERT_EQ(capturing_fs->GetCapturedFileChecksumFuncName(), + "FileChecksumCrc32c"); + + Close(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index cafb3710092d..1433bd6014e6 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -506,6 +506,8 @@ TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) { table_options.prepopulate_block_cache = BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + // Include a compression dictionary block + options.compression_opts.max_dict_bytes = 123; DestroyAndReopen(options); std::string value(kValueSize, 'a'); @@ -537,6 +539,9 @@ TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) { options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT)); } ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + + // Including compression dict + ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_MISS)); } // Verify compaction not counted @@ -824,68 +829,78 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) { const int kNumEntriesPerFile = 128; const int kNumBytesPerEntry = 1024; - // Try all the available libraries that support dictionary compression - std::vector compression_types; - if (Zlib_Supported()) { - compression_types.push_back(kZlibCompression); - } - if (LZ4_Supported()) { - compression_types.push_back(kLZ4Compression); - compression_types.push_back(kLZ4HCCompression); - } - if (ZSTD_Supported()) { - compression_types.push_back(kZSTD); - } + std::vector dict_compressions = + GetSupportedDictCompressions(); Random rnd(301); - for (auto compression_type : compression_types) { - Options options = CurrentOptions(); - options.bottommost_compression = compression_type; - options.bottommost_compression_opts.max_dict_bytes = 4096; - options.bottommost_compression_opts.enabled = true; - options.create_if_missing = true; - options.num_levels = 2; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry; - BlockBasedTableOptions table_options; - table_options.cache_index_and_filter_blocks = true; - table_options.block_cache.reset(new MockCache()); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - DestroyAndReopen(options); + // Format version before and after compression handling changes + for (int format_version : {6, 7}) { + // Test all supported compression types because (at least historically) + // dictionary compression could be enabled and a dictionary block saved + // but ignored by some compression types. Ensure we at least don't crash + // or return corruption for those. + for (auto compression_type : GetSupportedCompressions()) { + // Extra handling checks only for types actually supporting dictionary + // compression. + bool dict_supported = + std::count(dict_compressions.begin(), dict_compressions.end(), + compression_type) > 0; - RecordCacheCountersForCompressionDict(options); + Options options = CurrentOptions(); + options.bottommost_compression = compression_type; + options.bottommost_compression_opts.max_dict_bytes = 4096; + options.bottommost_compression_opts.enabled = true; + options.create_if_missing = true; + options.num_levels = 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry; + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.block_cache.reset(new MockCache()); + table_options.format_version = format_version; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); - for (int i = 0; i < kNumFiles; ++i) { - ASSERT_EQ(i, NumTableFilesAtLevel(0, 0)); - for (int j = 0; j < kNumEntriesPerFile; ++j) { - std::string value = rnd.RandomString(kNumBytesPerEntry); - ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str())); + RecordCacheCountersForCompressionDict(options); + + for (int i = 0; i < kNumFiles; ++i) { + ASSERT_EQ(i, NumTableFilesAtLevel(0, 0)); + for (int j = 0; j < kNumEntriesPerFile; ++j) { + std::string value = rnd.RandomString(kNumBytesPerEntry); + ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str())); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1)); + + if (dict_supported) { + // Compression dictionary blocks are preloaded. + CheckCacheCountersForCompressionDict( + options, kNumFiles /* expected_compression_dict_misses */, + 0 /* expected_compression_dict_hits */, + kNumFiles /* expected_compression_dict_inserts */); + } + + // Seek to a key in a file. It should cause the SST's dictionary + // meta-block to be read. + RecordCacheCounters(options); + RecordCacheCountersForCompressionDict(options); + ReadOptions read_options; + ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1))); + + if (dict_supported) { + // Two block hits: index and dictionary since they are prefetched + // One block missed/added: data block + CheckCacheCounters(options, 1 /* expected_misses */, + 2 /* expected_hits */, 1 /* expected_inserts */, + 0 /* expected_failures */); + CheckCacheCountersForCompressionDict( + options, 0 /* expected_compression_dict_misses */, + 1 /* expected_compression_dict_hits */, + 0 /* expected_compression_dict_inserts */); } - ASSERT_OK(Flush()); } - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ASSERT_EQ(0, NumTableFilesAtLevel(0)); - ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1)); - - // Compression dictionary blocks are preloaded. - CheckCacheCountersForCompressionDict( - options, kNumFiles /* expected_compression_dict_misses */, - 0 /* expected_compression_dict_hits */, - kNumFiles /* expected_compression_dict_inserts */); - - // Seek to a key in a file. It should cause the SST's dictionary meta-block - // to be read. - RecordCacheCounters(options); - RecordCacheCountersForCompressionDict(options); - ReadOptions read_options; - ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1))); - // Two block hits: index and dictionary since they are prefetched - // One block missed/added: data block - CheckCacheCounters(options, 1 /* expected_misses */, 2 /* expected_hits */, - 1 /* expected_inserts */, 0 /* expected_failures */); - CheckCacheCountersForCompressionDict( - options, 0 /* expected_compression_dict_misses */, - 1 /* expected_compression_dict_hits */, - 0 /* expected_compression_dict_inserts */); } } @@ -1646,7 +1661,7 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) { std::string export_files_dir = dbname_ + "/exported"; ExportImportFilesMetaData* metadata_ptr_ = nullptr; Checkpoint* checkpoint; - ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint)); ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir, &metadata_ptr_)); ASSERT_NE(metadata_ptr_, nullptr); @@ -1683,7 +1698,7 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) { // StableCacheKeyTestFS, Checkpoint will resort to full copy not hard link. // (Checkpoint not available in LITE mode to test this.) auto db_copy_name = dbname_ + "-copy"; - ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint)); ASSERT_OK(checkpoint->CreateCheckpoint(db_copy_name)); delete checkpoint; diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index edb02920e72d..eb6e51a95ec6 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -137,11 +137,6 @@ class SliceTransformLimitedDomainGeneric : public SliceTransform { // prefix will be x???? return src.size() >= 5; } - - bool InRange(const Slice& dst) const override { - // prefix will be x???? - return dst.size() == 5; - } }; // KeyMayExist can lead to a few false positives, but not false negatives. @@ -710,12 +705,20 @@ class AlwaysTrueBitsBuilder : public FilterBitsBuilder { count_ = 0; // Interpreted as "always true" filter (0 probes over 1 byte of // payload, 5 bytes metadata) - return Slice("\0\0\0\0\0\0", 6); + return Slice("\0\0\0\0\0\0", kAlwaysTrueFilterBytes); } using FilterBitsBuilder::Finish; size_t ApproximateNumEntries(size_t) override { return SIZE_MAX; } + size_t CalculateSpace(size_t /* num_entries */) override { + return kAlwaysTrueFilterBytes; + } + double EstimatedFpRate(size_t /* num_entries */, + size_t /* bytes */) override { + return 1.0; + } private: + static constexpr size_t kAlwaysTrueFilterBytes = 6; size_t count_ = 0; }; @@ -914,14 +917,14 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( std::make_tuple(kAutoBloom, FilterPartitioning::kCoupledPartitionedFilter, - kLatestFormatVersion), + kLatestBbtFormatVersion), std::make_tuple(kAutoBloom, FilterPartitioning::kDecoupledPartitionedFilter, - kLatestFormatVersion), + kLatestBbtFormatVersion), std::make_tuple(kAutoBloom, FilterPartitioning::kUnpartitionedFilter, - kLatestFormatVersion), + kLatestBbtFormatVersion), std::make_tuple(kAutoRibbon, FilterPartitioning::kUnpartitionedFilter, - kLatestFormatVersion))); + kLatestBbtFormatVersion))); #endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_F(DBBloomFilterTest, BloomFilterRate) { @@ -2069,11 +2072,6 @@ class SliceTransformLimitedDomain : public SliceTransform { // prefix will be x???? return src.size() >= 5 && src[0] == 'x'; } - - bool InRange(const Slice& dst) const override { - // prefix will be x???? - return dst.size() == 5 && dst[0] == 'x'; - } }; TEST_F(DBBloomFilterTest, PrefixExtractorWithFilter1) { @@ -4137,7 +4135,7 @@ TEST_F(DBBloomFilterTest, SstQueryFilter) { using Keys = std::vector; auto RangeQuery = - [factory, db = db_]( + [factory, db = db_.get()]( std::string lb, std::string ub, std::shared_ptr alt_factory = nullptr) { diff --git a/db/db_compaction_abort_test.cc b/db/db_compaction_abort_test.cc new file mode 100644 index 000000000000..a76e1d689f1f --- /dev/null +++ b/db/db_compaction_abort_test.cc @@ -0,0 +1,993 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include +#include +#include + +#include "db/compaction/compaction_job.h" +#include "db/db_impl/db_impl_secondary.h" +#include "db/db_test_util.h" +#include "options/options_helper.h" +#include "port/stack_trace.h" +#include "rocksdb/db.h" +#include "rocksdb/sst_file_writer.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +// Helper class to manage abort synchronization in tests. +// +// Compaction abort could happen at various stage of compaction. +// To test this, we need to trigger abort at different stage. This requires +// precise control on the timing of abort API invocation. To achieve this in a +// consistent way across various tests, we invoke AbortAllCompactions() within +// the sync point callback, that is added at various stages of compaction. +// However as the abort API is a blocking call, calling it within the sync point +// callback on the compaction thread would cause deadlock. This test helper +// class is designed to solve this challenge. +// +// 1. Abort must happen from a different thread: +// AbortAllCompactions() is typically called from the compaction thread +// via a sync point callback, so that we could precisely control the time of +// API invocation to simulate abort at different stage of compaction. +// However, we can't block the compaction thread waiting for the abort to +// complete - the compaction needs to continue executing to actually check +// the abort flag and exit. So we spawn a separate thread to call +// AbortAllCompactions(). +// +// 2. We need to know when abort completes: +// After compaction returns (with aborted status), we often need to: +// - Verify state (e.g., no output files created) +// - Call ResumeAllCompactions() +// - Run compaction again to verify it succeeds +// We must wait for the abort thread to finish before proceeding, otherwise +// we might call Resume before Abort completes, causing race conditions. +// +// 3. Sync point callbacks may fire multiple times: +// With multiple subcompactions, a callback like +// "CompactionJob::ProcessKeyValueCompaction:Start" fires once per +// subcompaction. We only want to trigger abort once, so we use +// abort_triggered_ as a guard. +// +// 4. Tests may need multiple abort cycles: +// Some tests (e.g., MultipleAbortResumeSequence) do abort->resume->abort +// multiple times. The class supports this by auto-resetting when a +// previous abort has completed. +class AbortSynchronizer { + public: + AbortSynchronizer() : abort_cv_(&abort_mutex_) {} + + ~AbortSynchronizer() { + // Join the thread if it was started - ensures clean shutdown + if (abort_thread_.joinable()) { + abort_thread_.join(); + } + } + + // Non-copyable, non-movable due to thread member + AbortSynchronizer(const AbortSynchronizer&) = delete; + AbortSynchronizer& operator=(const AbortSynchronizer&) = delete; + + // Trigger abort from a separate thread. + // - Safe to call multiple times; only first call in each cycle spawns thread + // - If a previous abort has completed, automatically resets state first + // - The spawned thread calls AbortAllCompactions() and signals completion + void TriggerAbort(DBImpl* db) { + // If previous abort completed, reset state to allow new abort + if (abort_triggered_.load() && abort_completed_.load()) { + Reset(); + } + + if (!abort_triggered_.exchange(true)) { + abort_thread_ = std::thread([this, db]() { + db->AbortAllCompactions(); + SignalAbortCompleted(); + }); + } + } + + // Wait for the abort thread to complete. + // Call this AFTER compaction returns to ensure the abort thread has finished + // before proceeding with Resume or other operations. + void WaitForAbortCompletion() { + MutexLock l(&abort_mutex_); + while (!abort_completed_.load()) { + abort_cv_.Wait(); + } + } + + // Reset state for reuse. Joins any previous thread first. + // Called automatically by TriggerAbort() if previous abort completed, + // but can also be called explicitly for clarity. + void Reset() { + if (abort_thread_.joinable()) { + abort_thread_.join(); + } + abort_triggered_.store(false); + abort_completed_.store(false); + } + + bool IsAbortTriggered() const { return abort_triggered_.load(); } + + private: + void SignalAbortCompleted() { + MutexLock l(&abort_mutex_); + abort_completed_.store(true); + abort_cv_.SignalAll(); + } + + std::atomic abort_triggered_{false}; // Guards against multiple spawns + std::atomic abort_completed_{false}; // Signals thread completion + port::Mutex abort_mutex_; + port::CondVar abort_cv_; + std::thread abort_thread_; // The thread that calls AbortAllCompactions() +}; + +// Helper to clean up SyncPoint state after tests +inline void CleanupSyncPoints() { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +// Helper class that combines AbortSynchronizer with sync point setup for +// deterministic abort triggering. This adds sync point coordination on top +// of AbortSynchronizer: +// +// This is useful when you need deterministic timing - the callback won't +// return until AbortAllCompactions() has actually set the abort flag, +// guaranteeing the compaction will see it on the next check. +class SyncPointAbortHelper { + public: + explicit SyncPointAbortHelper(const std::string& trigger_point) + : trigger_point_(trigger_point) {} + + // Set up sync points and callbacks. Call this before starting compaction. + void Setup(DBImpl* db_impl) { + db_impl_ = db_impl; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::AbortAllCompactions:FlagSet", kWaitPointName}, + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + trigger_point_, [this](void* /*arg*/) { + // Use AbortSynchronizer to handle the abort in a separate thread + abort_sync_.TriggerAbort(db_impl_); + + // Wait for abort flag to be set via sync point dependency + // This ensures deterministic timing - compaction will see the flag + TEST_SYNC_POINT_CALLBACK(kWaitPointName, nullptr); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + } + + // Wait for the abort to complete. Call this after compaction returns. + void WaitForAbortCompletion() { abort_sync_.WaitForAbortCompletion(); } + + // Clean up sync points and wait for abort completion in one call + void CleanupAndWait() { + CleanupSyncPoints(); + WaitForAbortCompletion(); + } + + private: + static constexpr const char* kWaitPointName = + "SyncPointAbortHelper::WaitForAbort"; + std::string trigger_point_; + DBImpl* db_impl_{nullptr}; + AbortSynchronizer abort_sync_; +}; + +class DBCompactionAbortTest : public DBTestBase { + public: + DBCompactionAbortTest() + : DBTestBase("db_compaction_abort_test", /*env_do_fsync=*/false) {} + + protected: + // Map to track the latest value of each key for verification + std::unordered_map expected_values_; + + // Statistics object for verifying compaction metrics + std::shared_ptr stats_; + + // Get current options with statistics enabled + Options GetOptionsWithStats() { + Options options = CurrentOptions(); + stats_ = CreateDBStatistics(); + options.statistics = stats_; + return options; + } + + // Populate database with test data. + // If overlapping=true, uses the same key range (0 to keys_per_file-1) in each + // file to ensure compaction has work to do. + // If overlapping=false, uses non-overlapping keys across files. + void PopulateData(int num_files, int keys_per_file, int value_size, + bool overlapping = true, int seed = 301) { + Random rnd(seed); + for (int i = 0; i < num_files; ++i) { + for (int j = 0; j < keys_per_file; ++j) { + int key_index = overlapping ? j : (j + i * keys_per_file); + std::string key = Key(key_index); + std::string value = rnd.RandomString(value_size); + ASSERT_OK(Put(key, value)); + expected_values_[key] = value; + } + ASSERT_OK(Flush()); + } + } + + // Verify data integrity by reading all keys and comparing with expected + // values + void VerifyDataIntegrity(int num_keys, int start_key = 0) { + std::string val; + for (int j = start_key; j < start_key + num_keys; ++j) { + std::string key = Key(j); + ASSERT_OK(dbfull()->Get(ReadOptions(), key, &val)); + auto it = expected_values_.find(key); + if (it != expected_values_.end()) { + ASSERT_EQ(it->second, val) << "Value mismatch for key: " << key; + } + } + } + + // Clear expected values (useful when reopening DB or between tests) + void ClearExpectedValues() { expected_values_.clear(); } + + // Run the common abort test pattern with SyncPointAbortHelper: + // 1. Set up sync point abort helper + // 2. Run compaction and verify it's aborted + // 3. Verify COMPACTION_ABORTED stat increased (if stats enabled) + // 4. Clean up, resume, and verify compaction succeeds + // 5. Verify COMPACT_WRITE_BYTES increased (if stats enabled) + void RunSyncPointAbortTest(const std::string& trigger_point, + CompactRangeOptions cro = CompactRangeOptions()) { + // Capture stats and file counts before abort + uint64_t aborted_before = 0; + uint64_t write_bytes_before = 0; + if (stats_) { + aborted_before = stats_->getTickerCount(COMPACTION_ABORTED); + write_bytes_before = stats_->getTickerCount(COMPACT_WRITE_BYTES); + } + + SyncPointAbortHelper helper(trigger_point); + helper.Setup(dbfull()); + + Status s = dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(s.IsCompactionAborted()); + + // Verify abort was counted + if (stats_) { + uint64_t aborted_after = stats_->getTickerCount(COMPACTION_ABORTED); + ASSERT_GT(aborted_after, aborted_before) + << "COMPACTION_ABORTED stat should increase after abort"; + } + + helper.CleanupAndWait(); + dbfull()->ResumeAllCompactions(); + + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + + // Verify compaction completed and wrote bytes + if (stats_) { + uint64_t write_bytes_after = stats_->getTickerCount(COMPACT_WRITE_BYTES); + ASSERT_GT(write_bytes_after, write_bytes_before) + << "COMPACT_WRITE_BYTES should increase after successful compaction"; + } + } +}; + +// Parameterized test for abort with different number of max subcompactions. +// This consolidates tests that were essentially duplicates with different +// max_subcompactions values +class DBCompactionAbortSubcompactionTest + : public DBCompactionAbortTest, + public ::testing::WithParamInterface {}; + +TEST_P(DBCompactionAbortSubcompactionTest, AbortWithVaryingSubcompactions) { + int max_subcompactions = GetParam(); + + Options options = GetOptionsWithStats(); + options.level0_file_num_compaction_trigger = 4; + options.max_subcompactions = max_subcompactions; + options.disable_auto_compactions = true; + Reopen(options); + + PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/100); + + RunSyncPointAbortTest("CompactionJob::RunSubcompactions:BeforeStart"); + + VerifyDataIntegrity(/*num_keys=*/100); +} + +INSTANTIATE_TEST_CASE_P(SubcompactionVariants, + DBCompactionAbortSubcompactionTest, + ::testing::Values(1, 2, 4), + [](const ::testing::TestParamInfo& param_info) { + return "MaxSubcompactionCount_" + + std::to_string(param_info.param); + }); + +// Parameterized test for abort with different compaction styles +// This consolidates tests for Level, Universal, and FIFO compaction styles +class DBCompactionAbortStyleTest + : public DBCompactionAbortTest, + public ::testing::WithParamInterface { + protected: + // Configure options based on compaction style + void ConfigureOptionsForStyle(Options& options, CompactionStyle style) { + options.compaction_style = style; + options.level0_file_num_compaction_trigger = 4; + options.disable_auto_compactions = true; + + switch (style) { + case kCompactionStyleLevel: + // Level compaction uses default settings + break; + case kCompactionStyleUniversal: + options.compaction_options_universal.size_ratio = 10; + break; + case kCompactionStyleFIFO: + // Set a large max_table_files_size to avoid deletion compaction + options.compaction_options_fifo.max_table_files_size = + 100 * 1024 * 1024; + // Enable intra-L0 compaction which goes through normal compaction path + options.compaction_options_fifo.allow_compaction = true; + options.max_open_files = -1; // Required for FIFO compaction + break; + default: + break; + } + } +}; + +TEST_P(DBCompactionAbortStyleTest, AbortCompaction) { + CompactionStyle style = GetParam(); + + Options options = GetOptionsWithStats(); + options.max_subcompactions = 1; + ConfigureOptionsForStyle(options, style); + Reopen(options); + + PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/100); + + RunSyncPointAbortTest("CompactionJob::RunSubcompactions:BeforeStart"); + + VerifyDataIntegrity(/*num_keys=*/100); +} + +INSTANTIATE_TEST_CASE_P( + CompactionStyleVariants, DBCompactionAbortStyleTest, + ::testing::Values(kCompactionStyleLevel, kCompactionStyleUniversal, + kCompactionStyleFIFO), + [](const ::testing::TestParamInfo& param_info) { + return OptionsHelper::compaction_style_to_string.at(param_info.param); + }); + +TEST_F(DBCompactionAbortTest, AbortManualCompaction) { + Options options = GetOptionsWithStats(); + options.level0_file_num_compaction_trigger = 10; + options.disable_auto_compactions = true; + Reopen(options); + + PopulateData(/*num_files=*/5, /*keys_per_file=*/100, /*value_size=*/1000); + + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + RunSyncPointAbortTest("CompactionJob::ProcessKeyValueCompaction:Start", cro); + + VerifyDataIntegrity(/*num_keys=*/100); +} + +TEST_F(DBCompactionAbortTest, AbortAutomaticCompaction) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 4; + options.max_subcompactions = 2; + options.disable_auto_compactions = false; + Reopen(options); + + Random rnd(301); + AbortSynchronizer abort_sync; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::ProcessKeyValueCompaction:Start", + [&](void* /*arg*/) { abort_sync.TriggerAbort(dbfull()); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(j), rnd.RandomString(1000))); + } + ASSERT_OK(Flush()); + } + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + CleanupSyncPoints(); + + abort_sync.WaitForAbortCompletion(); + dbfull()->ResumeAllCompactions(); + + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(j), rnd.RandomString(1000))); + } + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + std::string val; + for (int j = 0; j < 100; ++j) { + ASSERT_OK(dbfull()->Get(ReadOptions(), Key(j), &val)); + } +} + +TEST_F(DBCompactionAbortTest, AbortAndVerifyNoOutputFiles) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 4; + options.max_subcompactions = 2; + options.disable_auto_compactions = true; + Reopen(options); + + PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/1000); + + int num_l0_files_before = NumTableFilesAtLevel(0); + int num_l1_files_before = NumTableFilesAtLevel(1); + + SyncPointAbortHelper helper("CompactionJob::ProcessKeyValueCompaction:Start"); + helper.Setup(dbfull()); + + CompactRangeOptions cro; + Status s = dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(s.IsCompactionAborted()); + + CleanupSyncPoints(); + + int num_l0_files_after = NumTableFilesAtLevel(0); + int num_l1_files_after = NumTableFilesAtLevel(1); + + ASSERT_EQ(num_l0_files_before, num_l0_files_after); + ASSERT_EQ(num_l1_files_before, num_l1_files_after); + + helper.WaitForAbortCompletion(); + dbfull()->ResumeAllCompactions(); + + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + + int num_l0_files_final = NumTableFilesAtLevel(0); + int num_l1_files_final = NumTableFilesAtLevel(1); + + ASSERT_EQ(0, num_l0_files_final); + ASSERT_GT(num_l1_files_final, 0); + + VerifyDataIntegrity(/*num_keys=*/100); +} + +TEST_F(DBCompactionAbortTest, MultipleAbortResumeSequence) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 4; + options.max_subcompactions = 2; + options.disable_auto_compactions = true; + Reopen(options); + + PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/1000); + + for (int round = 0; round < 3; ++round) { + // Use SyncPointAbortHelper for deterministic abort timing - it waits + // for the abort flag to be set via sync point dependency + SyncPointAbortHelper helper( + "CompactionJob::ProcessKeyValueCompaction:Start"); + helper.Setup(dbfull()); + + CompactRangeOptions cro; + Status s = dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(s.IsCompactionAborted()); + + helper.CleanupAndWait(); + dbfull()->ResumeAllCompactions(); + } + + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + VerifyDataIntegrity(/*num_keys=*/100); +} + +TEST_F(DBCompactionAbortTest, AbortWithOutputFilesCleanup) { + Options options = CurrentOptions(); + options.num_levels = 2; // Ensure compaction output goes to L1 + options.level0_file_num_compaction_trigger = 4; + options.max_subcompactions = 2; + options.disable_auto_compactions = true; + options.target_file_size_base = 50 * 1024; + Reopen(options); + + PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/100); + + SyncPointAbortHelper helper("CompactionJob::RunSubcompactions:BeforeStart"); + helper.Setup(dbfull()); + + CompactRangeOptions cro; + Status s = dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(s.IsCompactionAborted()); + + CleanupSyncPoints(); + + int num_l1_files_after_abort = NumTableFilesAtLevel(1); + ASSERT_EQ(0, num_l1_files_after_abort); + + helper.WaitForAbortCompletion(); + dbfull()->ResumeAllCompactions(); + + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + + // Verify L0 files are compacted and L1 has output files + int num_l0_files_final = NumTableFilesAtLevel(0); + int num_l1_files_final = NumTableFilesAtLevel(1); + ASSERT_EQ(0, num_l0_files_final) + << "L0 should be empty after successful compaction"; + ASSERT_GT(num_l1_files_final, 0) + << "L1 should have files after successful compaction"; + + VerifyDataIntegrity(/*num_keys=*/100); +} + +TEST_F(DBCompactionAbortTest, NestedAbortResumeCalls) { + // Test that nested AbortAllCompactions() calls work correctly with the + // counter + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 4; + options.max_subcompactions = 2; + options.disable_auto_compactions = true; + Reopen(options); + + PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/1000); + + // First abort call + dbfull()->AbortAllCompactions(); + + // Nested abort call (counter should be 2) + dbfull()->AbortAllCompactions(); + + // Compaction should still be blocked after one resume + dbfull()->ResumeAllCompactions(); + + // Compaction should still return aborted because counter is still 1 + CompactRangeOptions cro; + Status s = dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(s.IsCompactionAborted()); + + // Second resume - counter should be 0 now + dbfull()->ResumeAllCompactions(); + + // Compaction should succeed now + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + + VerifyDataIntegrity(/*num_keys=*/100); +} + +TEST_F(DBCompactionAbortTest, AbortCompactFilesAPI) { + // Test that AbortAllCompactions works with CompactFiles API + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 100; // Disable auto compaction + options.disable_auto_compactions = true; + Reopen(options); + + PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/1000); + + // Get the L0 file names + std::vector files_to_compact; + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta); + for (const auto& file : cf_meta.levels[0].files) { + files_to_compact.push_back(file.name); + } + ASSERT_GE(files_to_compact.size(), 2); + + SyncPointAbortHelper helper("CompactionJob::ProcessKeyValueCompaction:Start"); + helper.Setup(dbfull()); + + CompactionOptions compact_options; + Status s = dbfull()->CompactFiles(compact_options, files_to_compact, 1); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(s.IsCompactionAborted()); + + helper.CleanupAndWait(); + dbfull()->ResumeAllCompactions(); + + // CompactFiles should work after resume + ASSERT_OK(dbfull()->CompactFiles(compact_options, files_to_compact, 1)); + + VerifyDataIntegrity(/*num_keys=*/100); +} + +TEST_F(DBCompactionAbortTest, AbortDoesNotAffectFlush) { + // Test that AbortAllCompactions does not affect flush operations + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 100; + options.disable_auto_compactions = true; + Reopen(options); + + Random rnd(301); + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(j), rnd.RandomString(1000))); + } + + // Abort compactions + dbfull()->AbortAllCompactions(); + + // Flush should still work + ASSERT_OK(Flush()); + + // Write more data + for (int j = 100; j < 200; ++j) { + ASSERT_OK(Put(Key(j), rnd.RandomString(1000))); + } + + // Flush should still work + ASSERT_OK(Flush()); + + // Resume compactions + dbfull()->ResumeAllCompactions(); + + VerifyDataIntegrity(/*num_keys=*/200); +} + +TEST_F(DBCompactionAbortTest, AbortBeforeCompactionStarts) { + // Test aborting before any compaction has started + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 4; + options.disable_auto_compactions = true; + Reopen(options); + + PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/1000); + + // Abort before starting compaction + dbfull()->AbortAllCompactions(); + + // Compaction should immediately return aborted + CompactRangeOptions cro; + Status s = dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(s.IsCompactionAborted()); + + // Resume + dbfull()->ResumeAllCompactions(); + + // Now compaction should work + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + + // Verify L0 files are compacted + ASSERT_EQ(0, NumTableFilesAtLevel(0)); +} + +// Test that in-progress blob and SST files are properly cleaned up when +// compaction is aborted. This specifically tests the case where abort happens +// while files are being written (opened but not yet completed/closed). +// This catches the bug where files exist on disk but are removed from the +// outputs_ vector (e.g., by RemoveLastEmptyOutput when file_size is 0 because +// the builder was abandoned), leaving orphan files. +TEST_F(DBCompactionAbortTest, AbortWithInProgressFileCleanup) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 4; + options.max_subcompactions = + 1; // Single subcompaction for deterministic behavior + options.disable_auto_compactions = true; + options.target_file_size_base = 32 * 1024; // 32KB + + // Enable BlobDB with garbage collection to force blob rewriting during + // compaction + options.enable_blob_files = true; + options.min_blob_size = 0; // All values go to blob files + options.blob_file_size = + 1024 * 1024; // 1MB - large enough to not close during test + // Enable blob garbage collection - this forces blob data to be rewritten + // during compaction, creating new blob files + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; // Include all blob files + options.blob_garbage_collection_force_threshold = 0.0; // Always force GC + + Reopen(options); + + // Write enough data to trigger the periodic abort check (every 1000 records). + // 4 files * 2000 keys = 2000 unique overlapping keys processed during + // compaction. The sync point triggers at 999, 1999, etc. + PopulateData(/*num_files=*/4, /*keys_per_file=*/2000, /*value_size=*/500); + + // Helper function to get blob files on disk with their names + auto GetBlobFilesOnDisk = [this]() -> std::vector { + std::vector blob_files; + std::vector files; + EXPECT_OK(env_->GetChildren(dbname_, &files)); + for (const auto& f : files) { + if (f.find(".blob") != std::string::npos) { + blob_files.push_back(f); + } + } + std::sort(blob_files.begin(), blob_files.end()); + return blob_files; + }; + + // Helper function to get blob file count in metadata + auto GetBlobFilesInMetadata = [this]() -> std::vector { + std::vector blob_file_numbers; + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta); + for (const auto& blob_meta : cf_meta.blob_files) { + blob_file_numbers.push_back(blob_meta.blob_file_number); + } + std::sort(blob_file_numbers.begin(), blob_file_numbers.end()); + return blob_file_numbers; + }; + + // Helper function to get SST files on disk + auto GetSstFilesOnDisk = [this]() -> std::vector { + std::vector sst_files; + std::vector files; + EXPECT_OK(env_->GetChildren(dbname_, &files)); + for (const auto& f : files) { + if (f.find(".sst") != std::string::npos) { + sst_files.push_back(f); + } + } + std::sort(sst_files.begin(), sst_files.end()); + return sst_files; + }; + + // Helper function to get SST file numbers in metadata + auto GetSstFilesInMetadata = [this]() -> std::vector { + std::vector sst_file_numbers; + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta); + for (const auto& level : cf_meta.levels) { + for (const auto& file : level.files) { + // Extract file number from the file name (e.g., "000010.sst" -> 10) + uint64_t file_num = 0; + std::string fname = file.name; + // Remove leading path separators if present + size_t pos = fname.rfind('/'); + if (pos != std::string::npos) { + fname = fname.substr(pos + 1); + } + if (sscanf(fname.c_str(), "%" PRIu64, &file_num) == 1) { + sst_file_numbers.push_back(file_num); + } + } + } + std::sort(sst_file_numbers.begin(), sst_file_numbers.end()); + return sst_file_numbers; + }; + + std::vector initial_blob_files = GetBlobFilesOnDisk(); + std::vector initial_meta_blobs = GetBlobFilesInMetadata(); + std::vector initial_sst_files = GetSstFilesOnDisk(); + std::vector initial_meta_ssts = GetSstFilesInMetadata(); + + ASSERT_GT(initial_blob_files.size(), 0u) << "Expected initial blob files"; + ASSERT_EQ(initial_blob_files.size(), initial_meta_blobs.size()) + << "Initial blob files should match between disk and metadata"; + ASSERT_GT(initial_sst_files.size(), 0u) << "Expected initial SST files"; + ASSERT_EQ(initial_sst_files.size(), initial_meta_ssts.size()) + << "Initial SST files should match between disk and metadata"; + + // Tracking variables for blob file lifecycle + std::atomic blob_writes{0}; + std::atomic abort_triggered{false}; + AbortSynchronizer abort_sync; + + // Set up dependency: the wait point will block until FlagSet is hit + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::AbortAllCompactions:FlagSet", + "DBCompactionAbortTest::InProgressBlob:WaitForAbort"}, + }); + + // Trigger abort after some blob writes during compaction output. + // This ensures we have an in-progress blob file when abort happens. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BlobFileBuilder::WriteBlobToFile:AddRecord", [&](void* /*arg*/) { + int count = blob_writes.fetch_add(1) + 1; + + // Trigger abort after 100 blob writes - this ensures: + // 1. A blob file has been opened (for writing) + // 2. Some data has been written to it + // 3. But it's not yet completed (blob_file_size is 1MB) + if (count == 100 && !abort_triggered.exchange(true)) { + abort_sync.TriggerAbort(dbfull()); + // Wait for abort flag to be set - this sync point blocks until + // FlagSet is processed + TEST_SYNC_POINT_CALLBACK( + "DBCompactionAbortTest::InProgressBlob:WaitForAbort", nullptr); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Run compaction - it should be aborted while blob file is in-progress + CompactRangeOptions cro; + Status s = dbfull()->CompactRange(cro, nullptr, nullptr); + + ASSERT_TRUE(s.IsIncomplete()) + << "Expected compaction to be aborted, got: " << s.ToString(); + + CleanupSyncPoints(); + abort_sync.WaitForAbortCompletion(); + + // Check state after abort + std::vector post_abort_disk_blobs = GetBlobFilesOnDisk(); + std::vector post_abort_meta_blobs = GetBlobFilesInMetadata(); + std::vector post_abort_disk_ssts = GetSstFilesOnDisk(); + std::vector post_abort_meta_ssts = GetSstFilesInMetadata(); + + // This is the key assertion for blob files: files on disk should match + // metadata. If the in-progress blob file was NOT cleaned up, there will be an + // extra file on disk that's not in metadata (orphan). + ASSERT_EQ(post_abort_disk_blobs.size(), post_abort_meta_blobs.size()) + << "Orphan blob file detected! In-progress blob file was not cleaned up " + "after abort. Files on disk: " + << post_abort_disk_blobs.size() + << ", Files in metadata: " << post_abort_meta_blobs.size() + << ". The difference indicates orphaned in-progress blob file(s)."; + + // This is the key assertion for SST files: files on disk should match + // metadata. If the in-progress SST file was NOT cleaned up, there will be an + // extra file on disk that's not in metadata (orphan). + ASSERT_EQ(post_abort_disk_ssts.size(), post_abort_meta_ssts.size()) + << "Orphan SST file detected! In-progress SST file was not cleaned up " + "after abort. Files on disk: " + << post_abort_disk_ssts.size() + << ", Files in metadata: " << post_abort_meta_ssts.size() + << ". The difference indicates orphaned in-progress SST file(s)."; + + // Resume and complete compaction to verify DB is still functional + dbfull()->ResumeAllCompactions(); + + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + + // Verify data integrity - we wrote 4 files * 2000 keys with overlapping keys + VerifyDataIntegrity(/*num_keys=*/2000); +} + +TEST_F(DBCompactionAbortTest, AbortBottommostLevelCompaction) { + Options options = CurrentOptions(); + options.num_levels = 4; + options.level0_file_num_compaction_trigger = 2; + options.max_bytes_for_level_base = 1024 * 10; // 10KB + options.max_bytes_for_level_multiplier = 2; + options.disable_auto_compactions = true; + Reopen(options); + + // Write data to fill multiple levels (non-overlapping keys) + PopulateData(/*num_files=*/6, /*keys_per_file=*/100, + /*value_size=*/500, /*overlapping=*/false); + + // First compact to push data to lower levels + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Write more data to L0 (overlapping keys) + PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/500); + + SyncPointAbortHelper helper("CompactionJob::ProcessKeyValueCompaction:Start"); + helper.Setup(dbfull()); + + // Trigger bottommost level compaction + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + Status s = dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(s.IsCompactionAborted()); + + helper.CleanupAndWait(); + dbfull()->ResumeAllCompactions(); + + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + + VerifyDataIntegrity(/*num_keys=*/600); +} + +// Test that while compactions are aborted, atomic range replace +// (IngestExternalFiles with atomic_replace_range) works correctly. +// This verifies that the abort state doesn't block other write operations +// like atomic range replace. +TEST_F(DBCompactionAbortTest, AbortThenAtomicRangeReplace) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 4; + options.max_subcompactions = 2; + options.disable_auto_compactions = true; + Reopen(options); + + // Create a directory for SST files + std::string sst_files_dir = dbname_ + "_sst_files/"; + ASSERT_OK(env_->CreateDirIfMissing(sst_files_dir)); + + // Populate initial data with overlapping keys + PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/500); + + // Verify initial data + VerifyDataIntegrity(/*num_keys=*/100); + + // Trigger compaction and abort it + SyncPointAbortHelper helper("CompactionJob::ProcessKeyValueCompaction:Start"); + helper.Setup(dbfull()); + + CompactRangeOptions cro; + Status s = dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(s.IsCompactionAborted()); + + helper.CleanupAndWait(); + + // While compaction is still aborted, perform atomic range replace using + // IngestExternalFiles with atomic_replace_range. This verifies that the + // abort state doesn't block other write operations. + // Using RangeOpt() (empty range) means replace everything in the CF. + + // Create an SST file with new data for keys 0-49 (replacing keys 0-99) + std::string sst_file_path = sst_files_dir + "atomic_replace_1.sst"; + SstFileWriter sst_file_writer(EnvOptions(), options); + ASSERT_OK(sst_file_writer.Open(sst_file_path)); + + // Write new values for keys 0-49 + Random rnd(42); + std::unordered_map new_values; + for (int j = 0; j < 50; ++j) { + std::string key = Key(j); + std::string value = "replaced_" + rnd.RandomString(100); + ASSERT_OK(sst_file_writer.Put(key, value)); + new_values[key] = value; + } + ASSERT_OK(sst_file_writer.Finish()); + + // Perform atomic range replace for the entire column family. + // Using RangeOpt() (default constructor) means replace everything in the CF. + IngestExternalFileArg arg; + arg.column_family = db_->DefaultColumnFamily(); + arg.external_files = {sst_file_path}; + arg.atomic_replace_range = RangeOpt(); + // snapshot_consistency must be false when using atomic_replace_range + arg.options.snapshot_consistency = false; + + // Atomic range replace should work even while compactions are aborted + ASSERT_OK(db_->IngestExternalFiles({arg})); + + // Now resume compactions after the atomic range replace + dbfull()->ResumeAllCompactions(); + + // Verify that the atomic range replace worked correctly: + // 1. Keys 0-49 should have new replaced values + std::string val; + for (int j = 0; j < 50; ++j) { + std::string key = Key(j); + ASSERT_OK(db_->Get(ReadOptions(), key, &val)); + auto it = new_values.find(key); + ASSERT_NE(it, new_values.end()); + ASSERT_EQ(it->second, val) << "Value mismatch for replaced key: " << key; + } + + // 2. Keys 50-99 should not exist (they were replaced/deleted by atomic + // replace) + for (int j = 50; j < 100; ++j) { + std::string key = Key(j); + Status get_status = db_->Get(ReadOptions(), key, &val); + ASSERT_TRUE(get_status.IsNotFound()) + << "Key " << key << " should not exist after full CF replace"; + } + + // Clean up SST files directory + ASSERT_OK(DestroyDir(env_, sst_files_dir)); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index a17a5a6ebe02..a97d3461501a 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -19,6 +19,7 @@ #include "rocksdb/advanced_options.h" #include "rocksdb/concurrent_task_limiter.h" #include "rocksdb/experimental.h" +#include "rocksdb/iostats_context.h" #include "rocksdb/sst_file_writer.h" #include "test_util/mock_time_env.h" #include "test_util/sync_point.h" @@ -74,6 +75,43 @@ class CompactionStatsCollector : public EventListener { std::vector> compaction_completed_; }; +class DeletionTriggeredCompactionWithMinFileSizeTestListener + : public EventListener { + public: + explicit DeletionTriggeredCompactionWithMinFileSizeTestListener( + uint64_t min_file_size) + : min_file_size_(min_file_size) {} + + void OnCompactionBegin(DB* db, const CompactionJobInfo& ci) override { + ASSERT_EQ(ci.compaction_reason, + CompactionReason::kFilesMarkedForCompaction); + + auto env = db->GetEnv(); + const std::vector& db_paths = db->GetOptions().db_paths; + for (const auto& file : ci.input_file_infos) { + uint64_t file_size = GetSstFileSize(env, db_paths, file.file_number); + + // Assert that the file size respects the minimum threshold + ASSERT_GE(file_size, min_file_size_); + } + } + + private: + static uint64_t GetSstFileSize(Env* env, const std::vector& db_paths, + uint64_t file_number) { + uint32_t path_id = 0; // since only one path + std::string sst_file_name = TableFileName(db_paths, file_number, path_id); + uint64_t file_size = 0; + Status s = env->GetFileSize(sst_file_name, &file_size); + if (!s.ok()) { + return 0; + } + return file_size; + } + + uint64_t min_file_size_; +}; + class DBCompactionTest : public DBTestBase { public: DBCompactionTest() @@ -127,6 +165,19 @@ class DBCompactionTestWithParam exclusive_manual_compaction_ = std::get<1>(GetParam()); } + class TrivialMoveEventListener : public EventListener { + public: + explicit TrivialMoveEventListener(size_t expected_trivially_moved_files) + : expected_trivially_moved_files_(expected_trivially_moved_files) {} + void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override { + ASSERT_EQ(ci.stats.num_input_files_trivially_moved, + expected_trivially_moved_files_); + } + + private: + size_t expected_trivially_moved_files_ = 0; + }; + // Required if inheriting from testing::WithParamInterface<> static void SetUpTestCase() {} static void TearDownTestCase() {} @@ -442,6 +493,72 @@ TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) { } } #endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +TEST_F(DBCompactionTest, UniversalReduceFileLockingRepickNothing) { + const int kFileNumCompactionTrigger = 3; + + Options options = CurrentOptions(); + options.compaction_options_universal.reduce_file_locking = true; + // Set `max_background_jobs` to be 3 to allow low and bottom priority thread + // to run compaction together + options.max_background_jobs = 3; + Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM); + options.num_levels = 3; + options.compaction_style = kCompactionStyleUniversal; + options.level0_file_num_compaction_trigger = kFileNumCompactionTrigger; + options.compaction_options_universal.max_size_amplification_percent = 1; + + DestroyAndReopen(options); + + // Need to get a token to enable compaction parallelism up to + // `max_background_compactions` jobs. + auto pressure_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {// Wait for the full (bottom-priority) compaction to be pre-picked as an + // intent (that is allowing files to be picked by other compactions and + // will pick later when the bottom-priority thread is available to + // execute the compaction) before triggering the low-priority compaction. + {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool", + "LowPriCompaction"}, + // Wait for low-priority compaction to start before + // repicking for the full compaction intent (bottom-priority), enabling + // them to run in parallel. + {"DBImpl::BackgroundCompaction:NonTrivial", + "DBImpl::BGWorkBottomCompaction"}}); + + bool bottom_pri_compaction_attempt_repick = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction():AfterPickCompactionBottomPri", + [&](void* arg) { + bottom_pri_compaction_attempt_repick = true; + Compaction* c = static_cast(arg); + // Verify the intended full compaction for bottom priority thread does + // not get to run (i.e, output to bottommost level) since when it + // repicks its files, some of the the intended input files are already + // compacted by the low priority thread + assert(c == nullptr); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 0; i < kFileNumCompactionTrigger; ++i) { + if (i == 0) { + ASSERT_OK(Put("file_locked_for_bottom_pri_compaction", "value")); + } else { + ASSERT_OK( + Put("file_not_locked_for_bottom_pri_compaction" + std::to_string(i), + "value")); + } + ASSERT_OK(Flush()); + } + + TEST_SYNC_POINT("LowPriCompaction"); + ASSERT_OK(Put("a_new_file_to_pick_for_low_pri_compaction", "value")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(bottom_pri_compaction_attempt_repick); +} TEST_F(DBCompactionTest, SkipStatsUpdateTest) { // This test verify UpdateAccumulatedStats is not on @@ -1292,6 +1409,89 @@ TEST_F(DBCompactionTest, RecoverDuringMemtableCompaction) { } while (ChangeOptions()); } +TEST_F(DBCompactionTest, CompactionWithDeletionsAndMinFileSize) { + const uint64_t kMinFileSize = 32 * 1024; // 32KB + const int kDeletionTriggerCount = 50; + const int kInitialKeyCount = 100; + const int kAdditionalKeyCount = 50; + const int kValueSize = 1024; + const int kSmallValueSize = 512; + const int kSeed = 301; + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 1024 * 1024; // 1MB + options.level0_file_num_compaction_trigger = 100; + + options.table_properties_collector_factories = { + NewCompactOnDeletionCollectorFactory( + kInitialKeyCount /* sliding window size */, kDeletionTriggerCount, + 0.5 /* deletion ratio */, kMinFileSize)}; + auto listener = + new DeletionTriggeredCompactionWithMinFileSizeTestListener(kMinFileSize); + options.listeners.emplace_back(listener); + + DestroyAndReopen(options); + Random rnd(kSeed); + + // Create a large file that will be subject to DTC later + for (int i = 0; i < kInitialKeyCount; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + + std::vector initial_metadata; + db_->GetLiveFilesMetaData(&initial_metadata); + ASSERT_EQ(initial_metadata.size(), 1); + + // Create small files that should not trigger compaction + ASSERT_OK(Put("small_file_key1", rnd.RandomString(kSmallValueSize))); + ASSERT_OK(Put("small_file_key2", rnd.RandomString(kSmallValueSize))); + ASSERT_OK(Flush()); + ASSERT_OK(Delete("small_file_key1")); + ASSERT_OK(Flush()); + + // Create a file with enough deletions and size to trigger DTC + // Delete keys from the large file to reach deletion threshold + for (int i = 0; i < kDeletionTriggerCount; i++) { + ASSERT_OK(Delete(Key(i))); + } + + // Add new keys to ensure the deletion file meets the min_file_size threshold + for (int i = kInitialKeyCount; i < kInitialKeyCount + kAdditionalKeyCount; + i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Verify file count after compaction + ASSERT_EQ(NumTableFilesAtLevel(0), 2); // Small file and deletion file + ASSERT_EQ(NumTableFilesAtLevel(1), 1); // Compacted large file + + // Verify deleted keys are gone + for (int i = 0; i < kDeletionTriggerCount; i++) { + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound()); + } + + // Verify non-deleted keys from large file are still accessible + for (int i = kDeletionTriggerCount; i < kInitialKeyCount; i++) { + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value)); + ASSERT_EQ(value.size(), kValueSize); + } + + // Verify new keys are accessible + for (int i = kInitialKeyCount; i < kInitialKeyCount + kAdditionalKeyCount; + i++) { + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value)); + ASSERT_EQ(value.size(), kValueSize); + } +} + TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) { int32_t trivial_move = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -1301,6 +1501,9 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) { Options options = CurrentOptions(); options.write_buffer_size = 100000000; + TrivialMoveEventListener* trivial_move_listener = + new TrivialMoveEventListener(1 /*expected_trivially_moved_files*/); + options.listeners.emplace_back(trivial_move_listener); options.max_subcompactions = max_subcompactions_; DestroyAndReopen(options); @@ -1361,6 +1564,10 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) { Options options = CurrentOptions(); options.disable_auto_compactions = true; + // 8 is number of `ranges` that each is a non overlapping file. + TrivialMoveEventListener* trivial_move_listener = + new TrivialMoveEventListener(8 /*expected_trivially_moved_files*/); + options.listeners.emplace_back(trivial_move_listener); options.write_buffer_size = 10 * 1024 * 1024; options.max_subcompactions = max_subcompactions_; @@ -1408,6 +1615,11 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) { trivial_move = 0; non_trivial_move = 0; values.clear(); + options.listeners.clear(); + // Same ranges of files, but now overlapping, trivial move not applicable. + TrivialMoveEventListener* trivial_move_listener2 = + new TrivialMoveEventListener(0 /*expected_trivially_moved_files*/); + options.listeners.emplace_back(trivial_move_listener2); DestroyAndReopen(options); // Same ranges as above but overlapping ranges = { @@ -1455,6 +1667,11 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) { Options options = CurrentOptions(); options.disable_auto_compactions = true; + // Two non overlapping files in L0 trivialy moved: + // file 1 [0 => 300], file 2 [600 => 700] + TrivialMoveEventListener* trivial_move_listener1 = + new TrivialMoveEventListener(2 /*expected_trivially_moved_files*/); + options.listeners.emplace_back(trivial_move_listener1); options.write_buffer_size = 10 * 1024 * 1024; options.num_levels = 7; options.max_subcompactions = max_subcompactions_; @@ -1991,7 +2208,8 @@ TEST_P(DBDeleteFileRangeTest, DeleteFileRange) { std::string end_string = Key(2000); Slice begin(begin_string); Slice end(end_string); - ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + ASSERT_OK( + DeleteFilesInRange(db_.get(), db_->DefaultColumnFamily(), &begin, &end)); int32_t deleted_count = 0; for (int32_t i = 0; i < 4300; i++) { @@ -2012,8 +2230,8 @@ TEST_P(DBDeleteFileRangeTest, DeleteFileRange) { Slice begin1(begin_string); Slice end1(end_string); // Try deleting files in range which contain no keys - ASSERT_OK( - DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin1, &end1)); + ASSERT_OK(DeleteFilesInRange(db_.get(), db_->DefaultColumnFamily(), &begin1, + &end1)); // Push data from level 0 to level 1 to force all data to be deleted // Note that we don't delete level 0 files @@ -2022,8 +2240,8 @@ TEST_P(DBDeleteFileRangeTest, DeleteFileRange) { ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ASSERT_OK( - DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr)); + ASSERT_OK(DeleteFilesInRange(db_.get(), db_->DefaultColumnFamily(), nullptr, + nullptr)); int32_t deleted_count2 = 0; for (int32_t i = 0; i < 4300; i++) { @@ -2087,14 +2305,11 @@ TEST_P(DBDeleteFileRangeTest, DeleteFilesInRanges) { auto begin_str1 = Key(0), end_str1 = Key(100); auto begin_str2 = Key(100), end_str2 = Key(200); auto begin_str3 = Key(200), end_str3 = Key(299); - Slice begin1(begin_str1), end1(end_str1); - Slice begin2(begin_str2), end2(end_str2); - Slice begin3(begin_str3), end3(end_str3); - std::vector ranges; - ranges.emplace_back(&begin1, &end1); - ranges.emplace_back(&begin2, &end2); - ranges.emplace_back(&begin3, &end3); - ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), + std::vector ranges; + ranges.emplace_back(begin_str1, end_str1); + ranges.emplace_back(begin_str2, end_str2); + ranges.emplace_back(begin_str3, end_str3); + ASSERT_OK(DeleteFilesInRanges(db_.get(), db_->DefaultColumnFamily(), ranges.data(), ranges.size())); ASSERT_EQ("0,3,7", FilesPerLevel(0)); @@ -2121,7 +2336,7 @@ TEST_P(DBDeleteFileRangeTest, DeleteFilesInRanges) { ranges.emplace_back(&begin1, &end1); ranges.emplace_back(&begin2, &end2); ranges.emplace_back(&begin3, &end3); - ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), + ASSERT_OK(DeleteFilesInRanges(db_.get(), db_->DefaultColumnFamily(), ranges.data(), ranges.size(), false)); ASSERT_EQ("0,1,4", FilesPerLevel(0)); @@ -2141,8 +2356,9 @@ TEST_P(DBDeleteFileRangeTest, DeleteFilesInRanges) { // Delete all files. { - RangePtr range; - ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), &range, 1)); + RangeOpt range; + ASSERT_OK( + DeleteFilesInRanges(db_.get(), db_->DefaultColumnFamily(), &range, 1)); ASSERT_EQ("", FilesPerLevel(0)); for (auto i = 0; i < 1000; i++) { @@ -2204,7 +2420,8 @@ TEST_P(DBDeleteFileRangeTest, DeleteFileRangeFileEndpointsOverlapBug) { // "1 -> vals[0]" to reappear. std::string begin_str = Key(0), end_str = Key(1); Slice begin = begin_str, end = end_str; - ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + ASSERT_OK( + DeleteFilesInRange(db_.get(), db_->DefaultColumnFamily(), &begin, &end)); ASSERT_EQ(vals[1], GetValue(Key(1))); db_->ReleaseSnapshot(snapshot); @@ -2797,46 +3014,99 @@ TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) { } TEST_F(DBCompactionTest, ManualAutoRace) { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"}, - {"DBImpl::RunManualCompaction:WaitScheduled", - "BackgroundCallCompaction:0"}}); + const int kNumL0FilesTrigger = 4; + // Verify that the auto compaction is retried after the conflicting exclusive + // manual compaction finishes for: + // 1. Non-bottom-priority compactions (tested with level compaction) + // 2. Bottom-priority compactions (tested with universal compaction) + for (auto compaction_style : + {kCompactionStyleLevel, kCompactionStyleUniversal}) { + Env::Default()->SetBackgroundThreads( + compaction_style == kCompactionStyleUniversal ? 2 : 0, + Env::Priority::BOTTOM); + for (auto universal_reduce_file_locking : {false, true}) { + if (compaction_style != kCompactionStyleUniversal && + universal_reduce_file_locking) { + continue; + } - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + Options options = CurrentOptions(); + options.num_levels = 3; + options.level0_file_num_compaction_trigger = kNumL0FilesTrigger; + options.compaction_style = compaction_style; + options.compaction_options_universal.reduce_file_locking = + universal_reduce_file_locking; - ASSERT_OK(Put(1, "foo", "")); - ASSERT_OK(Put(1, "bar", "")); - ASSERT_OK(Flush(1)); - ASSERT_OK(Put(1, "foo", "")); - ASSERT_OK(Put(1, "bar", "")); - // Generate four files in CF 0, which should trigger an auto compaction - ASSERT_OK(Put("foo", "")); - ASSERT_OK(Put("bar", "")); - ASSERT_OK(Flush()); - ASSERT_OK(Put("foo", "")); - ASSERT_OK(Put("bar", "")); - ASSERT_OK(Flush()); - ASSERT_OK(Put("foo", "")); - ASSERT_OK(Put("bar", "")); - ASSERT_OK(Flush()); - ASSERT_OK(Put("foo", "")); - ASSERT_OK(Put("bar", "")); - ASSERT_OK(Flush()); + DestroyAndReopen(options); + CreateAndReopenWithCF({"exclusive_manual_compaction_cf"}, options); - // The auto compaction is scheduled but waited until here - TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1"); - // The auto compaction will wait until the manual compaction is registerd - // before processing so that it will be cancelled. - CompactRangeOptions cro; - cro.exclusive_manual_compaction = true; - ASSERT_OK(dbfull()->CompactRange(cro, handles_[1], nullptr, nullptr)); - ASSERT_EQ("0,1", FilesPerLevel(1)); + // Set up sync points to ensure that the auto compaction + // encounters a conflict from exclusive manual compaction before the auto + // compaction gets to pick files, This will trigger a retry later. + // + // Specifically, the sync points are set up as following: + // 1. Wait until background low-pri scheduled (not picking files yet) or + // bottom-pri scheduled (not repicking files yet) for + // `universal_reduce_file_locking = true` before triggering + // CompactRange() + // + // 2. Wait until the triggered CompactRange() + // registers its compaction and creates conflict before the auto + // compaction picks or repicks files for the background compaction. + if (compaction_style == kCompactionStyleLevel || + !universal_reduce_file_locking) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"}, + {"DBImpl::RunManualCompaction:WaitScheduled", + "BackgroundCallCompaction:0"}}); + } else { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCompaction:ForwardToBottomPriPool", + "DBCompactionTest::ManualAutoRace:1"}, + {"DBImpl::RunManualCompaction:WaitScheduled", + "BackgroundCallCompaction:0:BottomPri"}}); + } - // Eventually the cancelled compaction will be rescheduled and executed. - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ASSERT_EQ("0,1", FilesPerLevel(0)); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + bool encounter_conflict = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction()::Conflict", + [&](void* /*arg*/) { encounter_conflict = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Generate files in CF 1 for exclusive CompactRange() + ASSERT_OK(Put(1, "foo", "")); + ASSERT_OK(Put(1, "bar", "")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "")); + ASSERT_OK(Put(1, "bar", "")); + // Generate files in CF0 to trigger full compaction + for (int i = 0; i < kNumL0FilesTrigger; ++i) { + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + } + + TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1"); + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + ASSERT_OK(dbfull()->CompactRange(cro, handles_[1], nullptr, nullptr)); + ASSERT_EQ(compaction_style == kCompactionStyleLevel ? "0,1" : "0,0,1", + FilesPerLevel(1)); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_TRUE(encounter_conflict); + + // Verify that the auto compaction is eventually executed after the + // exclusive CompactRange() finishes. + ASSERT_EQ(compaction_style == kCompactionStyleLevel ? "0,1" : "0,0,1", + FilesPerLevel(0)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + } + Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM); + } } TEST_P(DBCompactionTestWithParam, ManualCompaction) { @@ -3390,7 +3660,7 @@ TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) { GenerateNewRandomFile(&rnd, /* nowait */ true); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); - ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr)); + ASSERT_OK(experimental::SuggestCompactRange(db_.get(), nullptr, nullptr)); for (int num = 0; num < options.level0_file_num_compaction_trigger + 1; num++) { GenerateNewRandomFile(&rnd, /* nowait */ true); @@ -3959,41 +4229,51 @@ TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) { TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) { const int kNumFilesTrigger = 3; Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM); - for (bool use_universal_compaction : {false, true}) { - Options options = CurrentOptions(); - if (use_universal_compaction) { - options.compaction_style = kCompactionStyleUniversal; - } else { - options.compaction_style = kCompactionStyleLevel; - options.level_compaction_dynamic_level_bytes = true; - } - options.num_levels = 4; - options.write_buffer_size = 100 << 10; // 100KB - options.target_file_size_base = 32 << 10; // 32KB - options.level0_file_num_compaction_trigger = kNumFilesTrigger; - // Trigger compaction if size amplification exceeds 110% - options.compaction_options_universal.max_size_amplification_percent = 110; - DestroyAndReopen(options); + for (auto compaction_style : + {kCompactionStyleLevel, kCompactionStyleUniversal}) { + for (auto universal_reduce_file_locking : {false, true}) { + if (compaction_style != kCompactionStyleUniversal && + universal_reduce_file_locking) { + continue; + } + Options options = CurrentOptions(); + options.compaction_style = compaction_style; + if (compaction_style == kCompactionStyleLevel) { + options.level_compaction_dynamic_level_bytes = true; + } else { + options.compaction_options_universal.reduce_file_locking = + universal_reduce_file_locking; + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = + 110; + } + options.num_levels = 4; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = kNumFilesTrigger; - int num_bottom_pri_compactions = 0; - SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BGWorkBottomCompaction", - [&](void* /*arg*/) { ++num_bottom_pri_compactions; }); - SyncPoint::GetInstance()->EnableProcessing(); + DestroyAndReopen(options); - Random rnd(301); - for (int num = 0; num < kNumFilesTrigger; num++) { - ASSERT_EQ(NumSortedRuns(), num); - int key_idx = 0; - GenerateNewFile(&rnd, &key_idx); - } - ASSERT_OK(dbfull()->TEST_WaitForCompact()); + int num_bottom_pri_compactions = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BGWorkBottomCompaction", + [&](void* /*arg*/) { ++num_bottom_pri_compactions; }); + SyncPoint::GetInstance()->EnableProcessing(); - ASSERT_EQ(1, num_bottom_pri_compactions); + Random rnd(301); + for (int num = 0; num < kNumFilesTrigger; num++) { + ASSERT_EQ(NumSortedRuns(), num); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); - // Verify that size amplification did occur - ASSERT_EQ(NumSortedRuns(), 1); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_EQ(1, num_bottom_pri_compactions); + + // Verify that size amplification did occur + ASSERT_EQ(NumSortedRuns(), 1); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } } Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM); } @@ -4256,7 +4536,8 @@ TEST_F(DBCompactionTest, DeleteFilesInRangeConflictWithCompaction) { std::string end_string = Key(kMaxKey + 1); Slice begin(begin_string); Slice end(end_string); - ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + ASSERT_OK( + DeleteFilesInRange(db_.get(), db_->DefaultColumnFamily(), &begin, &end)); SyncPoint::GetInstance()->DisableProcessing(); } @@ -5912,6 +6193,9 @@ TEST_F(DBCompactionTest, SubcompactionEvent) { ASSERT_EQ(running_compactions_.find(ci.job_id), running_compactions_.end()); running_compactions_.emplace(ci.job_id, std::unordered_set()); + if (expected_num_l0_files_pre_compaction_ != -1) { + ASSERT_EQ(expected_num_l0_files_pre_compaction_, ci.num_l0_files); + } } void OnCompactionCompleted(DB* /*db*/, @@ -5921,6 +6205,9 @@ TEST_F(DBCompactionTest, SubcompactionEvent) { ASSERT_NE(it, running_compactions_.end()); ASSERT_EQ(it->second.size(), 0); running_compactions_.erase(it); + if (expected_num_l0_files_post_compaction_ != -1) { + ASSERT_EQ(expected_num_l0_files_post_compaction_, ci.num_l0_files); + } } void OnSubcompactionBegin(const SubcompactionJobInfo& si) override { @@ -5950,10 +6237,25 @@ TEST_F(DBCompactionTest, SubcompactionEvent) { return total_subcompaction_cnt_; } + void SetExpectedNumL0FilesPreCompaction(int num) { + expected_num_l0_files_pre_compaction_ = num; + } + + void SetExpectedNumL0FilesPostCompaction(int num) { + expected_num_l0_files_post_compaction_ = num; + } + + void ResetExpectedNumL0Files() { + SetExpectedNumL0FilesPreCompaction(-1); + SetExpectedNumL0FilesPostCompaction(-1); + } + private: InstrumentedMutex mutex_; std::unordered_map> running_compactions_; size_t total_subcompaction_cnt_ = 0; + int expected_num_l0_files_pre_compaction_ = -1; + int expected_num_l0_files_post_compaction_ = -1; }; Options options = CurrentOptions(); @@ -5973,6 +6275,7 @@ TEST_F(DBCompactionTest, SubcompactionEvent) { ASSERT_OK(Flush()); } MoveFilesToLevel(2); + ASSERT_EQ(FilesPerLevel(), "0,0,4"); // generate 2 files @ L1 which overlaps with L2 files for (int i = 0; i < 2; i++) { @@ -5982,11 +6285,18 @@ TEST_F(DBCompactionTest, SubcompactionEvent) { } ASSERT_OK(Flush()); } + listener->SetExpectedNumL0FilesPreCompaction(2 /* num */); + listener->SetExpectedNumL0FilesPostCompaction(0 /* num */); + MoveFilesToLevel(1); ASSERT_EQ(FilesPerLevel(), "0,2,4"); + listener->ResetExpectedNumL0Files(); + CompactRangeOptions comp_opts; comp_opts.max_subcompactions = 4; + + listener->SetExpectedNumL0FilesPreCompaction(0 /* num */); Status s = dbfull()->CompactRange(comp_opts, nullptr, nullptr); ASSERT_OK(s); ASSERT_OK(dbfull()->TEST_WaitForCompact()); @@ -5994,6 +6304,8 @@ TEST_F(DBCompactionTest, SubcompactionEvent) { ASSERT_EQ(listener->GetRunningCompactionCount(), 0); // and sub compaction is triggered ASSERT_GT(listener->GetTotalSubcompactionCount(), 0); + + listener->ResetExpectedNumL0Files(); } TEST_F(DBCompactionTest, CompactFilesOutputRangeConflict) { @@ -6561,7 +6873,11 @@ INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstPressureToken, RoundRobinSubcompactionsAgainstPressureToken, testing::Bool()); -TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) { +// FIXME: the test is flaky and failing the assertion +// ASSERT_EQ(actual_reserved_threads, expected_reserved_threads); +// It's likely a test set up issue, fix if we are to use RoubdRobin compaction. +TEST_P(RoundRobinSubcompactionsAgainstResources, + DISABLED_SubcompactionsUsingResources) { const int kKeysPerBuffer = 200; Options options = CurrentOptions(); options.num_levels = 4; @@ -6576,7 +6892,7 @@ TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) { // compaction is enough to make post-compaction L1 size less than // the maximum size (this test assumes only one round-robin compaction // is triggered by kLevelMaxLevelSize) - options.max_compaction_bytes = 100000000; + options.max_compaction_bytes = std::numeric_limits::max(); DestroyAndReopen(options); env_->SetBackgroundThreads(total_low_pri_threads_, Env::LOW); @@ -6609,41 +6925,33 @@ TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) { // More than 10 files are selected for round-robin under auto // compaction. The number of planned subcompaction is restricted by // the minimum number between available threads and compaction limits - ASSERT_EQ(num_planned_subcompactions - options.max_subcompactions, - std::min(total_low_pri_threads_, max_compaction_limits_) - 1); + auto actual_reserved_threads = + num_planned_subcompactions - options.max_subcompactions; + auto expected_reserved_threads = + std::min(total_low_pri_threads_, max_compaction_limits_) - 1; + ASSERT_EQ(actual_reserved_threads, expected_reserved_threads); num_planned_subcompactions_verified = true; }); - SyncPoint::GetInstance()->LoadDependency( - {{"RoundRobinSubcompactionsAgainstResources:0", - "BackgroundCallCompaction:0"}, - {"CompactionJob::AcquireSubcompactionResources:0", - "RoundRobinSubcompactionsAgainstResources:1"}, - {"RoundRobinSubcompactionsAgainstResources:2", - "CompactionJob::AcquireSubcompactionResources:1"}, - {"CompactionJob::ReleaseSubcompactionResources:0", - "RoundRobinSubcompactionsAgainstResources:3"}, - {"RoundRobinSubcompactionsAgainstResources:4", - "CompactionJob::ReleaseSubcompactionResources:1"}}); + + int acquire_count = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::AcquireSubcompactionResources:0", + [&](void* /*arg*/) { acquire_count++; }); + int release_count = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::ReleaseSubcompactionResources", + [&](void* /*arg*/) { release_count++; }); + SyncPoint::GetInstance()->EnableProcessing(); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()})); - TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:0"); - TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:1"); auto pressure_token = dbfull()->TEST_write_controler().GetCompactionPressureToken(); - - TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:2"); - TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:3"); - // We can reserve more threads now except one is being used - ASSERT_EQ(total_low_pri_threads_ - 1, - env_->ReserveThreads(total_low_pri_threads_, Env::Priority::LOW)); - ASSERT_EQ( - total_low_pri_threads_ - 1, - env_->ReleaseThreads(total_low_pri_threads_ - 1, Env::Priority::LOW)); - TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:4"); + ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()})); ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(num_planned_subcompactions_verified); + ASSERT_EQ(acquire_count, release_count); + SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); } @@ -6825,6 +7133,70 @@ TEST_F(DBCompactionTest, PartialManualCompaction) { ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); } +TEST_F(DBCompactionTest, ConcurrentFIFOPickingSameFileBug) { + Options opts = CurrentOptions(); + opts.compaction_style = CompactionStyle::kCompactionStyleLevel; + opts.num_levels = 3; + opts.disable_auto_compactions = true; + opts.max_background_jobs = 3; + + DestroyAndReopen(opts); + + ASSERT_OK(Put("k1", "v1")); + ASSERT_OK(Flush()); + + // Create a non-L0 SST file for multi-level FIFO size-based compaction later + MoveFilesToLevel(2); + + Options opts_new(opts); + opts_new.compaction_style = CompactionStyle::kCompactionStyleFIFO; + opts_new.max_open_files = -1; + // Set a low threshold to trigger multi-level size-based compaction + opts_new.compaction_options_fifo.max_table_files_size = 1; + + Reopen(opts_new); + + const CompactRangeOptions cro; + const Slice begin_key("k1"); + const Slice end_key("k2"); + + std::unique_ptr concurrent_compaction; + + bool within_first_compaction = true; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifestStart", [&](void* /*arg*/) { + if (!within_first_compaction) { + return; + } + within_first_compaction = false; + + // To allow the second/concurrent compaction to still see the non-L0 + // SST file and coerce the bug of picking that file + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::BackgroundCompaction:BeforeCompaction", + "VersionSet::LogAndApply:WriteManifest"}, + }); + + concurrent_compaction.reset(new port::Thread([&]() { + // Before the fix, the second CompactRange() will either fail the + // assertion of double file picking `being_compacted != + // inputs_[i][j]->being_compacted` in debug mode or cause LSM shape + // corruption "Cannot delete table file XXX from level 2 since it is + // not in the LSM tree" in release mode + Status s = db_->CompactRange(cro, &begin_key, &end_key); + ASSERT_OK(s); + })); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + Status s = db_->CompactRange(cro, &begin_key, &end_key); + SyncPoint::GetInstance()->DisableProcessing(); + + ASSERT_OK(s); + + concurrent_compaction->join(); +} + TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) { // Regression test for bug where manual compaction hangs forever when the DB // is in read-only mode. Verify it now at least returns, despite failing. @@ -7472,7 +7844,7 @@ class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest { options_.level0_file_num_compaction_trigger = 3; CompactionOptionsFIFO fifo_options; - if (compaction_path_to_test == "FindIntraL0Compaction" || + if (compaction_path_to_test == "PickCostBasedIntraL0Compaction" || compaction_path_to_test == "CompactRange") { fifo_options.allow_compaction = true; } else if (compaction_path_to_test == "CompactFile") { @@ -7572,7 +7944,7 @@ class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest { void SetupSyncPoints(const std::string& compaction_path_to_test) { compaction_path_sync_point_called_.store(false); - if (compaction_path_to_test == "FindIntraL0Compaction" && + if (compaction_path_to_test == "PickCostBasedIntraL0Compaction" && options_.compaction_style == CompactionStyle::kCompactionStyleLevel) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "PostPickFileToCompact", [&](void* arg) { @@ -7582,7 +7954,7 @@ class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest { *picked_file_to_compact = false; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "FindIntraL0Compaction", [&](void* /*arg*/) { + "PickCostBasedIntraL0Compaction", [&](void* /*arg*/) { compaction_path_sync_point_called_.store(true); }); @@ -7618,12 +7990,12 @@ class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest { "PickDeleteTriggeredCompactionReturnNonnullptr", [&](void* /*arg*/) { compaction_path_sync_point_called_.store(true); }); - } else if ((compaction_path_to_test == "FindIntraL0Compaction" || + } else if ((compaction_path_to_test == "PickCostBasedIntraL0Compaction" || compaction_path_to_test == "CompactRange") && options_.compaction_style == CompactionStyle::kCompactionStyleFIFO) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "FindIntraL0Compaction", [&](void* /*arg*/) { + "PickCostBasedIntraL0Compaction", [&](void* /*arg*/) { compaction_path_sync_point_called_.store(true); }); } @@ -7695,7 +8067,7 @@ TEST_F(DBCompactionTest, CompactFilesSupportKeyPlacementRangeConflict) { ASSERT_OK(Flush()); ASSERT_OK(Put("k4", "v")); ASSERT_OK(Flush()); - ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 1)); + ASSERT_OK(experimental::PromoteL0(db_.get(), db_->DefaultColumnFamily(), 1)); ASSERT_EQ("0,2,1", FilesPerLevel()); ASSERT_OK(Put("k2", "v")); @@ -7783,7 +8155,7 @@ TEST_F(DBCompactionTestL0FilesMisorderCorruption, IngestOneKeyValue(dbfull(), Key(i), "new", options_); } - SetupSyncPoints("FindIntraL0Compaction"); + SetupSyncPoints("PickCostBasedIntraL0Compaction"); ResumeCompactionThread(); ASSERT_OK(dbfull()->TEST_WaitForCompact()); @@ -7916,7 +8288,8 @@ TEST_F(DBCompactionTestL0FilesMisorderCorruption, TEST_F(DBCompactionTestL0FilesMisorderCorruption, FlushAfterIntraL0FIFOCompactionWithIngestedFile) { - for (const std::string compaction_path_to_test : {"FindIntraL0Compaction"}) { + for (const std::string compaction_path_to_test : + {"PickCostBasedIntraL0Compaction"}) { SetupOptions(CompactionStyle::kCompactionStyleFIFO, compaction_path_to_test); DestroyAndReopen(options_); @@ -9376,105 +9749,393 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) { } TEST_F(DBCompactionTest, FIFOChangeTemperature) { - for (bool write_time_default : {false, true}) { - SCOPED_TRACE("write time default? " + std::to_string(write_time_default)); + for (bool should_allow_trivial_copy : {false, true}) { + for (bool write_time_default : {false, true}) { + int32_t before_compaction_calls = 0; + int32_t after_compaction_calls = 0; + if (should_allow_trivial_copy) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TriviaCopyBeforeCompaction", + [&](void*) { ++before_compaction_calls; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TriviaCopyAfterCompaction", + [&](void*) { ++after_compaction_calls; }); + } else { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:BeforeCompaction", + [&](void*) { ++before_compaction_calls; }); - Options options = CurrentOptions(); - options.compaction_style = kCompactionStyleFIFO; - options.num_levels = 1; - options.max_open_files = -1; - options.level0_file_num_compaction_trigger = 2; - options.create_if_missing = true; - CompactionOptionsFIFO fifo_options; - fifo_options.file_temperature_age_thresholds = {{Temperature::kCold, 1000}}; - fifo_options.max_table_files_size = 100000000; - options.compaction_options_fifo = fifo_options; - env_->SetMockSleep(); - if (write_time_default) { - options.default_write_temperature = Temperature::kWarm; - } - // Should be ignored (TODO: fail?) - options.last_level_temperature = Temperature::kHot; - Reopen(options); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:AfterCompaction", + [&](void*) { ++after_compaction_calls; }); + } - int total_cold = 0; - int total_warm = 0; - int total_hot = 0; - int total_unknown = 0; - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "NewWritableFile::FileOptions.temperature", [&](void* arg) { - Temperature temperature = *(static_cast(arg)); - if (temperature == Temperature::kCold) { - total_cold++; - } else if (temperature == Temperature::kWarm) { - total_warm++; - } else if (temperature == Temperature::kHot) { + SCOPED_TRACE("write time default? " + std::to_string(write_time_default)); + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleFIFO; + options.num_levels = 1; + options.max_open_files = -1; + options.level0_file_num_compaction_trigger = 2; + options.create_if_missing = true; + CompactionOptionsFIFO fifo_options; + fifo_options.file_temperature_age_thresholds = { + {Temperature::kCold, 1000}}; + fifo_options.max_table_files_size = 100000000; + fifo_options.allow_trivial_copy_when_change_temperature = + should_allow_trivial_copy; + fifo_options.trivial_copy_buffer_size = 4096; + options.compaction_options_fifo = fifo_options; + env_->SetMockSleep(); + if (write_time_default) { + options.default_write_temperature = Temperature::kWarm; + } + // Should be ignored (TODO: fail?) + options.last_level_temperature = Temperature::kHot; + Reopen(options); + + int total_cold = 0; + int total_warm = 0; + int total_hot = 0; + int total_ice = 0; + int total_unknown = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewWritableFile::FileOptions.temperature", [&](void* arg) { + Temperature temperature = *(static_cast(arg)); + if (temperature == Temperature::kCold) { + total_cold++; + } else if (temperature == Temperature::kWarm) { + total_warm++; + } else if (temperature == Temperature::kHot) { + total_hot++; + } else if (temperature == Temperature::kIce) { + total_ice++; + } else { + assert(temperature == Temperature::kUnknown); + total_unknown++; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // The file system does not support checksum handoff. The check + // will be ignored. + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + + // First two L0 files both become eligible for temperature change + // compaction They should be compacted one-by-one. + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(1200); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + if (write_time_default) { + // Also test dynamic option change + ASSERT_OK(db_->SetOptions({{"default_write_temperature", "kHot"}})); + } + + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(4, metadata.file_count); + if (write_time_default) { + ASSERT_EQ(Temperature::kHot, metadata.levels[0].files[0].temperature); + ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[1].temperature); + // Includes obsolete/deleted files moved to cold + ASSERT_EQ(total_warm, 3); + ASSERT_EQ(total_hot, 1); + // Includes non-SST DB files + ASSERT_GT(total_unknown, 0); + } else { + ASSERT_EQ(Temperature::kUnknown, + metadata.levels[0].files[0].temperature); + ASSERT_EQ(Temperature::kUnknown, + metadata.levels[0].files[1].temperature); + ASSERT_EQ(total_warm, 0); + ASSERT_EQ(total_hot, 0); + // Includes non-SST DB files + ASSERT_GT(total_unknown, 4); + } + ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[2].temperature); + ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[3].temperature); + ASSERT_EQ(2, total_cold); + + ASSERT_EQ(2, before_compaction_calls); + ASSERT_EQ(2, after_compaction_calls); + + Destroy(options); + } + } +} + +using TemperatureSet = SmallEnumSet; +static void VerifyTemperatureFileReadStats(const Statistics& st, + TemperatureSet temps) { + SCOPED_TRACE("Temp set size = " + std::to_string(temps.count())); + constexpr uint64_t min_bytes = 100; + constexpr uint64_t min_count = 1; + + IOStatsContext* iostats = get_iostats_context(); + if (temps.Contains(Temperature::kHot)) { + EXPECT_GE(st.getTickerCount(HOT_FILE_READ_BYTES), min_bytes); + EXPECT_GE(st.getTickerCount(HOT_FILE_READ_COUNT), min_count); + EXPECT_GE(iostats->file_io_stats_by_temperature.hot_file_bytes_read, + min_bytes); + EXPECT_GE(iostats->file_io_stats_by_temperature.hot_file_read_count, + min_count); + + } else { + EXPECT_EQ(st.getTickerCount(HOT_FILE_READ_BYTES), 0); + EXPECT_EQ(st.getTickerCount(HOT_FILE_READ_COUNT), 0); + EXPECT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); + EXPECT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + } + + if (temps.Contains(Temperature::kWarm)) { + EXPECT_GE(st.getTickerCount(WARM_FILE_READ_BYTES), min_bytes); + EXPECT_GE(st.getTickerCount(WARM_FILE_READ_COUNT), min_count); + EXPECT_GE(iostats->file_io_stats_by_temperature.warm_file_bytes_read, + min_bytes); + EXPECT_GE(iostats->file_io_stats_by_temperature.warm_file_read_count, + min_count); + } else { + EXPECT_EQ(st.getTickerCount(WARM_FILE_READ_BYTES), 0); + EXPECT_EQ(st.getTickerCount(WARM_FILE_READ_COUNT), 0); + EXPECT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); + EXPECT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); + } + + if (temps.Contains(Temperature::kCool)) { + EXPECT_GE(st.getTickerCount(COOL_FILE_READ_BYTES), min_bytes); + EXPECT_GE(st.getTickerCount(COOL_FILE_READ_COUNT), min_count); + EXPECT_GE(iostats->file_io_stats_by_temperature.cool_file_bytes_read, + min_bytes); + EXPECT_GE(iostats->file_io_stats_by_temperature.cool_file_read_count, + min_count); + } else { + EXPECT_EQ(st.getTickerCount(COOL_FILE_READ_BYTES), 0); + EXPECT_EQ(st.getTickerCount(COOL_FILE_READ_COUNT), 0); + EXPECT_EQ(iostats->file_io_stats_by_temperature.cool_file_bytes_read, 0); + EXPECT_EQ(iostats->file_io_stats_by_temperature.cool_file_read_count, 0); + } + + if (temps.Contains(Temperature::kCold)) { + EXPECT_GE(st.getTickerCount(COLD_FILE_READ_BYTES), min_bytes); + EXPECT_GE(st.getTickerCount(COLD_FILE_READ_COUNT), min_count); + EXPECT_GE(iostats->file_io_stats_by_temperature.cold_file_bytes_read, + min_bytes); + EXPECT_GE(iostats->file_io_stats_by_temperature.cold_file_read_count, + min_count); + } else { + EXPECT_EQ(st.getTickerCount(COLD_FILE_READ_BYTES), 0); + EXPECT_EQ(st.getTickerCount(COLD_FILE_READ_COUNT), 0); + EXPECT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); + EXPECT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0); + } + + if (temps.Contains(Temperature::kIce)) { + EXPECT_GE(st.getTickerCount(ICE_FILE_READ_BYTES), min_bytes); + EXPECT_GE(st.getTickerCount(ICE_FILE_READ_COUNT), min_count); + EXPECT_GE(iostats->file_io_stats_by_temperature.ice_file_bytes_read, + min_bytes); + EXPECT_GE(iostats->file_io_stats_by_temperature.ice_file_read_count, + min_count); + } else { + EXPECT_EQ(st.getTickerCount(ICE_FILE_READ_BYTES), 0); + EXPECT_EQ(st.getTickerCount(ICE_FILE_READ_COUNT), 0); + EXPECT_EQ(iostats->file_io_stats_by_temperature.ice_file_bytes_read, 0); + EXPECT_EQ(iostats->file_io_stats_by_temperature.ice_file_read_count, 0); + } +} + +TEST_F(DBCompactionTest, FIFOMultiTierTemperatureAging) { + // Test multi-tier aging: Hot -> Warm -> Cool -> Cold -> Ice + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleFIFO; + options.num_levels = 1; + options.max_open_files = -1; + options.level0_file_num_compaction_trigger = 2; + options.create_if_missing = true; + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.no_block_cache = true; // Simplify statistics + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + CompactionOptionsFIFO fifo_options; + // Multi-tier aging: files age through multiple temperatures + fifo_options.file_temperature_age_thresholds = { + {Temperature::kWarm, 500}, // Hot -> Warm after 500s + {Temperature::kCool, 1000}, // Warm -> Cool + {Temperature::kCold, 1500}, // Cool -> Cold + {Temperature::kIce, 2000} // Cold -> Ice + }; + fifo_options.max_table_files_size = 100000000; + fifo_options.allow_trivial_copy_when_change_temperature = true; + options.compaction_options_fifo = fifo_options; + options.default_write_temperature = Temperature::kHot; + + Reopen(options); + env_->SetMockSleep(); + + // Track all temperature file creations + int total_hot = 0, total_warm = 0, total_cool = 0, total_cold = 0, + total_ice = 0, total_unknown = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewWritableFile::FileOptions.temperature", [&](void* arg) { + Temperature temperature = *(static_cast(arg)); + switch (temperature) { + case Temperature::kHot: total_hot++; - } else { - assert(temperature == Temperature::kUnknown); + break; + case Temperature::kWarm: + total_warm++; + break; + case Temperature::kCool: + total_cool++; + break; + case Temperature::kCold: + total_cold++; + break; + case Temperature::kIce: + total_ice++; + break; + case Temperature::kUnknown: total_unknown++; - } - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + break; + default: + break; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - // The file system does not support checksum handoff. The check - // will be ignored. - ASSERT_OK(Put(Key(0), "value1")); - env_->MockSleepForSeconds(800); - ASSERT_OK(Put(Key(2), "value2")); + // Create initial three files (will start as Hot), enough to ensure key + // range filtering will be applied in FilePicker::GetNextFile() with one + // more file + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put(Key(0), Random::GetTLSInstance()->RandomBinaryString(100))); ASSERT_OK(Flush()); + } - ASSERT_OK(Put(Key(0), "value1")); - ASSERT_OK(Put(Key(2), "value2")); - ASSERT_OK(Flush()); + // Test reading from Hot temperature file + ASSERT_OK(options.statistics->Reset()); + get_iostats_context()->Reset(); - // First two L0 files both become eligible for temperature change compaction - // They should be compacted one-by-one. - ASSERT_OK(Put(Key(0), "value1")); - env_->MockSleepForSeconds(1200); - ASSERT_OK(Put(Key(2), "value2")); - ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(100U, Get(Key(0)).size()); - if (write_time_default) { - // Also test dynamic option change - ASSERT_OK(db_->SetOptions({{"default_write_temperature", "kHot"}})); - } + VerifyTemperatureFileReadStats(*options.statistics, Temperature::kHot); - ASSERT_OK(Put(Key(0), "value1")); - env_->MockSleepForSeconds(800); - ASSERT_OK(Put(Key(2), "value2")); - ASSERT_OK(Flush()); + // Land well into each time interval + env_->MockSleepForSeconds(100); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Age initial files to warm + env_->MockSleepForSeconds(500); + ASSERT_OK(Put(Key(1), Random::GetTLSInstance()->RandomBinaryString(101))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + // Test reading from Warm temperature file (the aged file) + ASSERT_OK(options.statistics->Reset()); + get_iostats_context()->Reset(); - ColumnFamilyMetaData metadata; - db_->GetColumnFamilyMetaData(&metadata); - ASSERT_EQ(4, metadata.file_count); - if (write_time_default) { - ASSERT_EQ(Temperature::kHot, metadata.levels[0].files[0].temperature); - ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[1].temperature); - // Includes obsolete/deleted files moved to cold - ASSERT_EQ(total_warm, 3); - ASSERT_EQ(total_hot, 1); - // Includes non-SST DB files - ASSERT_GT(total_unknown, 0); - } else { - ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); - ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[1].temperature); - ASSERT_EQ(total_warm, 0); - ASSERT_EQ(total_hot, 0); - // Includes non-SST DB files - ASSERT_GT(total_unknown, 4); - } - ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[2].temperature); - ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[3].temperature); - ASSERT_EQ(2, total_cold); + ASSERT_EQ(100U, Get(Key(0)).size()); - Destroy(options); + // Verify Warm file statistics + VerifyTemperatureFileReadStats(*options.statistics, Temperature::kWarm); + + // Age initial files to cool + env_->MockSleepForSeconds(500); + ASSERT_OK(Put(Key(2), Random::GetTLSInstance()->RandomBinaryString(102))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Test reading from Cool temperature file (the aged file) + ASSERT_OK(options.statistics->Reset()); + get_iostats_context()->Reset(); + + ASSERT_EQ(100U, Get(Key(0)).size()); + + VerifyTemperatureFileReadStats(*options.statistics, Temperature::kCool); + + // Age initial files to cold + env_->MockSleepForSeconds(500); + ASSERT_OK(Put(Key(3), Random::GetTLSInstance()->RandomBinaryString(103))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Test reading from Cold temperature file (the aged file) + ASSERT_OK(options.statistics->Reset()); + get_iostats_context()->Reset(); + + ASSERT_EQ(100U, Get(Key(0)).size()); + + VerifyTemperatureFileReadStats(*options.statistics, Temperature::kCold); + + // Age initial files to ice + env_->MockSleepForSeconds(500); + ASSERT_OK(Put(Key(4), Random::GetTLSInstance()->RandomBinaryString(104))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Test reading from Ice temperature file (the aged file) + ASSERT_OK(options.statistics->Reset()); + get_iostats_context()->Reset(); + + ASSERT_EQ(100U, Get(Key(0)).size()); + + VerifyTemperatureFileReadStats(*options.statistics, Temperature::kIce); + + // Verify temperature progression in metadata + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + + // Should have files at different temperatures + std::map temp_counts; + for (const auto& file : metadata.levels[0].files) { + temp_counts[file.temperature]++; } + + // Verify current files temperatures + EXPECT_EQ(temp_counts[Temperature::kHot], 1); + EXPECT_EQ(temp_counts[Temperature::kWarm], 1); + EXPECT_EQ(temp_counts[Temperature::kCool], 1); + EXPECT_EQ(temp_counts[Temperature::kCold], 1); + EXPECT_EQ(temp_counts[Temperature::kIce], 3); + + // Verify historical (and current) file temperatures + EXPECT_EQ(total_hot, 7); + EXPECT_EQ(total_warm, 6); + EXPECT_EQ(total_cool, 5); + EXPECT_EQ(total_cold, 4); + EXPECT_EQ(total_ice, 3); + + // Final comprehensive test: read from all temperature files + Reopen(options); + ASSERT_OK(options.statistics->Reset()); + get_iostats_context()->Reset(); + + // Read from all files to verify cumulative statistics + for (int i = 0; i < 5; i++) { + ASSERT_EQ(static_cast(100 + i), Get(Key(i)).size()); + } + + VerifyTemperatureFileReadStats(*options.statistics, TemperatureSet::All()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBCompactionTest, DisableMultiManualCompaction) { @@ -9918,55 +10579,60 @@ TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) { env_->SetBackgroundThreads(1, Env::Priority::BOTTOM); - Options options = CurrentOptions(); - options.level0_file_num_compaction_trigger = kNumL0Files; - options.num_levels = kNumLevels; - DestroyAndReopen(options); + for (bool universal_reduce_file_locking : {false, true}) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.num_levels = kNumLevels; + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.reduce_file_locking = + universal_reduce_file_locking; + DestroyAndReopen(options); - // Setup last level to be non-empty since it's a bit unclear whether - // compaction to an empty level would be considered "bottommost". - ASSERT_OK(Put(Key(0), "val")); - ASSERT_OK(Flush()); - MoveFilesToLevel(kNumLevels - 1); + // Setup last level to be non-empty since it's a bit unclear whether + // compaction to an empty level would be considered "bottommost". + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + MoveFilesToLevel(kNumLevels - 1); - SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::BGWorkBottomCompaction", - "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" - "PreTriggerCompaction"}, - {"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" - "PostTriggerCompaction", - "BackgroundCallCompaction:0"}}); - SyncPoint::GetInstance()->EnableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkBottomCompaction", + "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PreTriggerCompaction"}, + {"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PostTriggerCompaction", + "BackgroundCallCompaction:0"}}); + SyncPoint::GetInstance()->EnableProcessing(); - port::Thread compact_range_thread([&] { - CompactRangeOptions cro; - cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; - cro.exclusive_manual_compaction = false; - ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); - }); + port::Thread compact_range_thread([&] { + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + cro.exclusive_manual_compaction = false; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + }); - // Sleep in the low-pri thread so any newly scheduled compaction will be - // queued. Otherwise it might finish before we check its existence. - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - sleeping_task_low.WaitUntilSleeping(); + // Sleep in the low-pri thread so any newly scheduled compaction will be + // queued. Otherwise it might finish before we check its existence. + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_low, Env::Priority::LOW); + sleeping_task_low.WaitUntilSleeping(); - TEST_SYNC_POINT( - "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" - "PreTriggerCompaction"); - for (int i = 0; i < kNumL0Files; ++i) { - ASSERT_OK(Put(Key(0), "val")); - ASSERT_OK(Flush()); - } - ASSERT_EQ(0u, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); - TEST_SYNC_POINT( - "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" - "PostTriggerCompaction"); + TEST_SYNC_POINT( + "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PreTriggerCompaction"); + for (int i = 0; i < kNumL0Files; ++i) { + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + } + ASSERT_EQ(0u, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + TEST_SYNC_POINT( + "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PostTriggerCompaction"); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); - compact_range_thread.join(); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + compact_range_thread.join(); + } } TEST_F(DBCompactionTest, BottommostFileCompactionAllowIngestBehind) { @@ -10472,7 +11138,7 @@ TEST_F(DBCompactionTest, NumberOfSubcompactions) { } } -TEST_F(DBCompactionTest, VerifyRecordCount) { +TEST_F(DBCompactionTest, VerifyInputRecordCount) { Options options = CurrentOptions(); options.compaction_style = kCompactionStyleLevel; options.level0_file_num_compaction_trigger = 3; @@ -10510,6 +11176,103 @@ TEST_F(DBCompactionTest, VerifyRecordCount) { ASSERT_TRUE(std::strstr(s.getState(), expect)); } +TEST_F(DBCompactionTest, VerifyOutputRecordCountBlockBasedTable) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + options.compaction_verify_record_count = true; + DestroyAndReopen(options); + Random rnd(301); + + // Create 2 overlapping L0 files + for (int i = 1; i < 20; i += 2) { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + } + ASSERT_OK(Flush()); + + ASSERT_OK(db_->DeleteRange(WriteOptions(), Key(10), Key(15))); + + for (int i = 0; i < 20; i += 2) { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + } + ASSERT_OK(Flush()); + + // Skip adding every 7th key in the output table + int num_iter = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::Add::skip", [&](void* skip) { + num_iter++; + if (num_iter % 7 == 0) { + *(bool*)skip = true; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.IsCorruption()); + const char* expect = + "Number of keys in compaction output SST files does not match number of " + "keys added."; + ASSERT_TRUE(std::strstr(s.getState(), expect)); +} + +TEST_F(DBCompactionTest, VerifyOutputRecordCountPlainTable) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + options.compaction_verify_record_count = true; + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 0; + plain_table_options.bloom_bits_per_key = 2; + plain_table_options.hash_table_ratio = 0.8; + plain_table_options.index_sparseness = 3; + plain_table_options.huge_page_tlb_size = 0; + plain_table_options.encoding_type = kPrefix; + plain_table_options.full_scan_mode = false; + plain_table_options.store_index_in_file = false; + + options.table_factory.reset(NewPlainTableFactory(plain_table_options)); + options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true)); + + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.allow_mmap_reads = false; + options.allow_concurrent_memtable_write = false; + options.unordered_write = false; + + DestroyAndReopen(options); + Random rnd(301); + + // Create 2 overlapping L0 files + for (int i = 1; i < 20; i += 2) { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + } + ASSERT_OK(Flush()); + + for (int i = 0; i < 20; i += 2) { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + } + ASSERT_OK(Flush()); + + // Skip adding every 7th key in the output table + int num_iter = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PlainTableBuilder::Add::skip", [&](void* skip) { + num_iter++; + if (num_iter % 7 == 0) { + *(bool*)skip = true; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.IsCorruption()); + const char* expect = + "Number of keys in compaction output SST files does not match number of " + "keys added."; + ASSERT_TRUE(std::strstr(s.getState(), expect)); +} + TEST_F(DBCompactionTest, ErrorWhenReadFileHead) { // This is to test a bug that is fixed in // https://github.com/facebook/rocksdb/pull/11782. @@ -10782,6 +11545,124 @@ TEST_F(DBCompactionTest, RecordNewestKeyTimeForTtlCompaction) { ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); } + +// Test verifies compaction file cutting logic when using tail size estimation +// maintains output files at or below the target file size. +TEST_F(DBCompactionTest, CompactionRespectsTargetSizeWithTailEstimation) { + const int kInitialKeyCount = 10000; // 10k keys + const int kValueSize = 100; // 100 bytes per key + const int kSeed = 301; + + Options options = CurrentOptions(); + options.target_file_size_is_upper_bound = true; + options.target_file_size_base = 256 * 1024; + options.write_buffer_size = 2 * 1024 * 1024; + options.level0_file_num_compaction_trigger = 100; // Never trigger L0->L1 + options.compression = kNoCompression; + + BlockBasedTableOptions table_options; + table_options.partition_filters = true; + table_options.metadata_block_size = 4 * 1024; + table_options.index_type = BlockBasedTableOptions::kBinarySearch; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + + // Generate 2 L0 files + // Generate first file with 10k keys (each ~100 bytes) approx 1.2MB total + Random rnd(kSeed); + for (int i = 0; i < kInitialKeyCount; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + + // Generate second file with overlapping keys to force compaction (prevent + // trivial move) + for (int i = kInitialKeyCount / 2; i < kInitialKeyCount * 1.5; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + + // Capture file metadata and assert two L0 files + std::vector file_metadata; + db_->GetLiveFilesMetaData(&file_metadata); + ASSERT_EQ(file_metadata.size(), 2); + for (const auto& file : file_metadata) { + ASSERT_EQ(file.level, 0); + }; + + // Manually compact LO files to L1 + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Verify that compacted output files are under target file size + for (const auto& file : file_metadata) { + if (file.level > 0) { + EXPECT_LE(file.size, options.target_file_size_base) + << "Output file size exceeds target size: " << " File: " << file.name + << " level: " << file.level << " File size: " << file.size + << " Target size: " << options.target_file_size_base; + } + } +} + +class PeriodicCompactionListener : public EventListener { + public: + explicit PeriodicCompactionListener() {} + void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override { + if (ci.compaction_reason == CompactionReason::kPeriodicCompaction) { + ++num_periodic_compactions; + } + } + + std::atomic num_periodic_compactions = 0; +}; + +TEST_F(DBCompactionTest, PeriodicTask) { + // Tests that when no trigger event is fired (flush/compaction/setoptions), + // periodic compaction is still triggered by a scheduled periodic function. + auto mock_clock = std::make_shared(env_->GetSystemClock()); + mock_clock->SetCurrentTime(100); + mock_clock->InstallTimedWaitFixCallback(); + auto mock_env = std::make_unique(env_, mock_clock); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) { + auto periodic_task_scheduler_ptr = + static_cast(arg); + periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock.get()); + }); + + Options options; + options.env = mock_env.get(); + options.compaction_style = kCompactionStyleUniversal; + options.statistics = CreateDBStatistics(); + int kPeriodicCompactionSeconds = 7 * 24 * 60 * 60; // 1 week + options.periodic_compaction_seconds = kPeriodicCompactionSeconds; + options.num_levels = 50; + auto listener = std::make_shared(); + options.listeners.push_back(listener); + ASSERT_OK(TryReopen(options)); + + Random* rnd = Random::GetTLSInstance(); + for (int k = 0; k < 10; ++k) { + ASSERT_OK(Put(Key(k), rnd->RandomString(100))); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); + ASSERT_EQ(1, NumTableFilesAtLevel(49)); + + dbfull()->TEST_WaitForPeriodicTaskRun( + [&] { mock_clock->MockSleepForSeconds(kPeriodicCompactionSeconds + 1); }); + ASSERT_OK(db_->WaitForCompact({})); + + ASSERT_EQ(listener->num_periodic_compactions, 1); + Close(); +} } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_encryption_test.cc b/db/db_encryption_test.cc index 1d17e5d9bbd1..7967719888bb 100644 --- a/db/db_encryption_test.cc +++ b/db/db_encryption_test.cc @@ -17,9 +17,10 @@ class DBEncryptionTest : public DBTestBase { public: DBEncryptionTest() : DBTestBase("db_encryption_test", /*env_do_fsync=*/true) {} - Env* GetTargetEnv() { + Env* GetNonEncryptedEnv() { if (encrypted_env_ != nullptr) { - return (static_cast(encrypted_env_))->target(); + return (static_cast_with_check(encrypted_env_)) + ->env_target(); } else { return env_; } @@ -38,7 +39,7 @@ TEST_F(DBEncryptionTest, CheckEncrypted) { auto status = env_->GetChildren(dbname_, &fileNames); ASSERT_OK(status); - Env* target = GetTargetEnv(); + Env* target = GetNonEncryptedEnv(); int hits = 0; for (auto it = fileNames.begin(); it != fileNames.end(); ++it) { if (*it == "LOCK") { @@ -89,7 +90,7 @@ TEST_F(DBEncryptionTest, CheckEncrypted) { } TEST_F(DBEncryptionTest, ReadEmptyFile) { - auto defaultEnv = GetTargetEnv(); + auto defaultEnv = GetNonEncryptedEnv(); // create empty file for reading it back in later auto envOptions = EnvOptions(CurrentOptions()); @@ -116,6 +117,40 @@ TEST_F(DBEncryptionTest, ReadEmptyFile) { ASSERT_TRUE(data.empty()); } +TEST_F(DBEncryptionTest, NotSupportedGetFileSize) { + // Validate envrypted env does not support GetFileSize. + // The goal of the test is to validate the encrypted env/fs does not support + // GetFileSize API on FSRandomAccessFile interface. + // This test combined with the rest of the integration tests validate that + // the new API GetFileSize on FSRandomAccessFile interface is not required to + // be supported for database to work properly. + // The GetFileSize API is used in ReadFooterFromFile() API to get the file + // size. When GetFileSize API is not supported, the ReadFooterFromFile() API + // will use FileSystem GetFileSize API as fallback. Refer to the + // EncryptedRandomAccessFile class definition for more details. + if (!encrypted_env_) { + return; + } + + auto fs = encrypted_env_->GetFileSystem(); + + // create empty file for reading it back in later + auto filePath = dbname_ + "/empty.empty"; + + // Create empty file + CreateFile(fs.get(), filePath, "", false); + + // Open it for reading footer + std::unique_ptr randomAccessFile; + auto status = fs->NewRandomAccessFile(filePath, FileOptions(), + &randomAccessFile, nullptr); + ASSERT_OK(status); + + uint64_t fileSize; + status = randomAccessFile->GetFileSize(&fileSize); + ASSERT_TRUE(status.IsNotSupported()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_etc3_test.cc b/db/db_etc3_test.cc new file mode 100644 index 000000000000..e5152fcd58d2 --- /dev/null +++ b/db/db_etc3_test.cc @@ -0,0 +1,161 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" + +namespace ROCKSDB_NAMESPACE { + +class DBEtc3Test : public DBTestBase { + public: + DBEtc3Test() : DBTestBase("db_etc3_test", /*env_do_fsync=*/true) {} +}; + +TEST_F(DBEtc3Test, ManifestRollOver) { + do { + Options options; + // Force new manifest on each manifest write + options.max_manifest_file_size = 0; + options.max_manifest_space_amp_pct = 0; + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + { + ASSERT_OK(Put(1, "key1", std::string(1000, '1'))); + ASSERT_OK(Put(1, "key2", std::string(1000, '2'))); + ASSERT_OK(Put(1, "key3", std::string(1000, '3'))); + uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo(); + ASSERT_OK(Flush(1)); // This should trigger LogAndApply. + uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo(); + ASSERT_GT(manifest_after_flush, manifest_before_flush); + // Re-open should always re-create manifest file + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush); + ASSERT_EQ(std::string(1000, '1'), Get(1, "key1")); + ASSERT_EQ(std::string(1000, '2'), Get(1, "key2")); + ASSERT_EQ(std::string(1000, '3'), Get(1, "key3")); + } + } while (ChangeCompactOptions()); +} + +TEST_F(DBEtc3Test, AutoTuneManifestSize) { + // Ensure we have auto-tuning beyond max_manifest_file_size by default + ASSERT_EQ(DBOptions{}.max_manifest_space_amp_pct, 500); + + Options options = CurrentOptions(); + ASSERT_OK(db_->SetOptions({{"level0_file_num_compaction_trigger", "20"}})); + + // Use large column family names to essentially control the amount of payload + // data needed for the manifest file. Drop manifest entries don't include the + // CF name so are small. + uint64_t prev_manifest_num = 0, cur_manifest_num = 0; + std::deque handles; + int counter = 5; + auto AddCfFn = [&]() { + std::string name = "cf" + std::to_string(counter++); + name.resize(1000, 'a'); + ASSERT_OK(db_->CreateColumnFamily(options, name, &handles.emplace_back())); + prev_manifest_num = cur_manifest_num; + cur_manifest_num = dbfull()->TEST_Current_Manifest_FileNo(); + }; + auto DropCfFn = [&]() { + ASSERT_OK(db_->DropColumnFamily(handles.front())); + ASSERT_OK(db_->DestroyColumnFamilyHandle(handles.front())); + handles.pop_front(); + prev_manifest_num = cur_manifest_num; + cur_manifest_num = dbfull()->TEST_Current_Manifest_FileNo(); + }; + auto TrivialManifestWriteFn = [&]() { + ASSERT_OK(Put("x", std::to_string(counter++))); + ASSERT_OK(Flush()); + prev_manifest_num = cur_manifest_num; + cur_manifest_num = dbfull()->TEST_Current_Manifest_FileNo(); + }; + + options.max_manifest_file_size = 1000000; + options.max_manifest_space_amp_pct = 0; // no auto-tuning yet + DestroyAndReopen(options); + + // With the generous (minimum) maximum manifest size, should not be rotated + AddCfFn(); + AddCfFn(); + AddCfFn(); + ASSERT_EQ(prev_manifest_num, cur_manifest_num); + + // Change options for small max and (still) no auto-tuning + ASSERT_OK(db_->SetDBOptions({{"max_manifest_file_size", "3000"}})); + + // Takes effect on the next manifest write + TrivialManifestWriteFn(); + ASSERT_LT(prev_manifest_num, cur_manifest_num); + + // Now we have to rewrite the whole manifest on each write because the + // compacted size exceeds the "max" size. + AddCfFn(); + ASSERT_LT(prev_manifest_num, cur_manifest_num); + DropCfFn(); + ASSERT_LT(prev_manifest_num, cur_manifest_num); + AddCfFn(); + ASSERT_LT(prev_manifest_num, cur_manifest_num); + TrivialManifestWriteFn(); + ASSERT_LT(prev_manifest_num, cur_manifest_num); + + // Enabling auto-tuning should fix this, immediately for next manifest writes. + // This will allow up to double-ish the size of the compacted manifest, + // which last should have been 4000 + some bytes. + ASSERT_EQ(handles.size(), 4U); + ASSERT_OK(db_->SetDBOptions({{"max_manifest_space_amp_pct", "105"}})); + + // After 9 CF names should be enough to rotate the manifest + for (int i = 1; i <= 5; ++i) { + if ((i % 2) == 1) { + DropCfFn(); + } + AddCfFn(); + ASSERT_EQ(prev_manifest_num, cur_manifest_num); + } + TrivialManifestWriteFn(); + ASSERT_LT(prev_manifest_num, cur_manifest_num); + + // We now have a different last compacted manifest size, should be + // able to go beyond 9 CFs named in manifest this time. + ASSERT_EQ(handles.size(), 6U); + + DropCfFn(); + DropCfFn(); + for (int i = 1; i <= 4; ++i) { + DropCfFn(); + AddCfFn(); + ASSERT_EQ(prev_manifest_num, cur_manifest_num); + } + // We've written 10 named CFs to the manifest. We should be able to + // dynamically change the auto-tuning still based on the last "compacted" + // manifest size of 7000 + some bytes. + ASSERT_OK(db_->SetDBOptions({{"max_manifest_space_amp_pct", "51"}})); + TrivialManifestWriteFn(); + ASSERT_LT(prev_manifest_num, cur_manifest_num); + // And the "compacted" manifest size has reset again, so should be changed + // again sooner. + ASSERT_EQ(handles.size(), 4U); + for (int i = 1; i <= 2; ++i) { + AddCfFn(); + ASSERT_EQ(prev_manifest_num, cur_manifest_num); + } + // Enough for manifest change + AddCfFn(); + ASSERT_LT(prev_manifest_num, cur_manifest_num); + + // Wrap up + while (!handles.empty()) { + DropCfFn(); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index e9ae7981ae2c..7bf821170031 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -75,11 +75,9 @@ Status DBImpl::GetLiveFiles(std::vector& ret, ret.emplace_back(CurrentFileName("")); ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number())); - // The OPTIONS file number is zero in read-write mode when OPTIONS file - // writing failed and the DB was configured with - // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file - // number is zero when no OPTIONS file exist at all. In those cases we do not - // record any OPTIONS file in the live file list. + // In read-only mode the OPTIONS file number is zero when no OPTIONS file + // exist at all. In this cases we do not record any OPTIONS file in the live + // file list. if (versions_->options_file_number() != 0) { ret.emplace_back(OptionsFileName("", versions_->options_file_number())); } @@ -111,6 +109,7 @@ Status DBImpl::GetSortedWalFilesImpl(VectorWalPtr& files, bool need_seqnos) { { InstrumentedMutexLock l(&mutex_); while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) { + TEST_SYNC_POINT("DBImpl::GetSortedWalFilesImpl:WaitPurge"); bg_cv_.Wait(); } @@ -185,14 +184,14 @@ Status DBImpl::GetSortedWalFilesImpl(VectorWalPtr& files, bool need_seqnos) { return s; } -Status DBImpl::GetCurrentWalFile(std::unique_ptr* current_log_file) { +Status DBImpl::GetCurrentWalFile(std::unique_ptr* current_wal_file) { uint64_t current_logfile_number; { InstrumentedMutexLock l(&mutex_); - current_logfile_number = logfile_number_; + current_logfile_number = cur_wal_number_; } - return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file); + return wal_manager_.GetLiveWalFile(current_logfile_number, current_wal_file); } Status DBImpl::GetLiveFilesStorageInfo( @@ -332,7 +331,7 @@ Status DBImpl::GetLiveFilesStorageInfo( const uint64_t options_size = versions_->options_file_size_; const uint64_t min_log_num = MinLogNumberToKeep(); // Ensure consistency with manifest for track_and_verify_wals_in_manifest - const uint64_t max_log_num = logfile_number_; + const uint64_t max_log_num = cur_wal_number_; mutex_.Unlock(); @@ -369,11 +368,9 @@ Status DBImpl::GetLiveFilesStorageInfo( } } - // The OPTIONS file number is zero in read-write mode when OPTIONS file - // writing failed and the DB was configured with - // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file - // number is zero when no OPTIONS file exist at all. In those cases we do not - // record any OPTIONS file in the live file list. + // In read-only mode the OPTIONS file number is zero when no OPTIONS file + // exist at all. In this cases we do not record any OPTIONS file in the live + // file list. if (options_number != 0) { results.emplace_back(); LiveFileStorageInfo& info = results.back(); diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index b72de9a6886e..e1000c576fd2 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -101,7 +101,7 @@ TEST_F(DBFlushTest, SyncFail) { TEST_SYNC_POINT("DBFlushTest::SyncFail:2"); fault_injection_env->SetFilesystemActive(true); // Now the background job will do the flush; wait for it. - // Returns the IO error happend during flush. + // Returns the IO error happened during flush. ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("", FilesPerLevel()); // flush failed. Destroy(options); @@ -518,11 +518,11 @@ TEST_F(DBFlushTest, StatisticsGarbageInsertAndDeletes) { // Note : one set of delete for KEY1, KEY2, KEY3 is written to // SSTable to propagate the delete operations to K-V pairs // that could have been inserted into the database during past Flush - // opeartions. + // operations. EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -= KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t); - // Additional useful paylaod. + // Additional useful payload. ASSERT_OK(Delete(KEY4)); ASSERT_OK(Delete(KEY5)); ASSERT_OK(Delete(KEY6)); @@ -614,7 +614,7 @@ TEST_F(DBFlushTest, StatisticsGarbageRangeDeletes) { // Note : one set of deleteRange for (KEY1, KEY2) and (KEY2, KEY3) is written // to SSTable to propagate the deleteRange operations to K-V pairs that could - // have been inserted into the database during past Flush opeartions. + // have been inserted into the database during past Flush operations. EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -= (KEY1.size() + KEY2.size() + sizeof(uint64_t)) + (KEY2.size() + KEY3.size() + sizeof(uint64_t)); @@ -709,7 +709,7 @@ class TestFlushListener : public EventListener { // that assumption does not hold (see the test case MultiDBMultiListeners // below). ASSERT_TRUE(test_); - if (db == test_->db_) { + if (db == test_->db_.get()) { std::vector> files_by_level; test_->dbfull()->TEST_GetFilesMetaData(db->DefaultColumnFamily(), &files_by_level); @@ -842,7 +842,7 @@ TEST_F(DBFlushTest, FixFlushReasonRaceFromConcurrentFlushes) { ASSERT_OK(Put(1, Key(idx), std::string(1, 'v'))); } - // To coerce a manual flush happenning in the middle of GetLiveFiles's flush, + // To coerce a manual flush happening in the middle of GetLiveFiles's flush, // we need to pause background flush thread and enable it later. std::shared_ptr sleeping_task = std::make_shared(); @@ -851,7 +851,7 @@ TEST_F(DBFlushTest, FixFlushReasonRaceFromConcurrentFlushes) { sleeping_task.get(), Env::Priority::HIGH); sleeping_task->WaitUntilSleeping(); - // Coerce a manual flush happenning in the middle of GetLiveFiles's flush + // Coerce a manual flush happening in the middle of GetLiveFiles's flush bool get_live_files_paused_at_sync_point = false; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::AtomicFlushMemTables:AfterScheduleFlush", [&](void* /* arg */) { @@ -1428,7 +1428,7 @@ TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) { Close(); } -// Create a Compaction Fitler that will be invoked +// Create a Compaction Filter that will be invoked // at flush time and will update the value of a KV pair // if the key string is "lower" than the filter_key_ string. class ConditionalUpdateFilter : public CompactionFilter { @@ -2533,7 +2533,7 @@ TEST_F(DBFlushTest, TombstoneVisibleInSnapshot) { ASSERT_OK(db_->Put(WriteOptions(), "foo", "value0")); - ManagedSnapshot snapshot_guard(db_); + ManagedSnapshot snapshot_guard(db_.get()); ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily(); ASSERT_OK(db_->Flush(FlushOptions(), default_cf)); @@ -2574,7 +2574,7 @@ TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) { txn_db_opts.write_policy = TxnDBWritePolicy::WRITE_COMMITTED; ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db)); ASSERT_NE(txn_db, nullptr); - db_ = txn_db; + db_.reset(txn_db); // Create two more columns other than default CF. std::vector cfs = {"puppy", "kitty"}; @@ -2638,9 +2638,8 @@ TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) { // it means atomic flush didn't write the min_log_number_to_keep to MANIFEST. cfs.push_back(kDefaultColumnFamilyName); ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); - DBImpl* db_impl = static_cast(db_); - ASSERT_TRUE(db_impl->allow_2pc()); - ASSERT_NE(db_impl->MinLogNumberToKeep(), 0); + ASSERT_TRUE(dbfull()->allow_2pc()); + ASSERT_NE(dbfull()->MinLogNumberToKeep(), 0); } TEST_P(DBAtomicFlushTest, ManualAtomicFlush) { @@ -3504,6 +3503,209 @@ TEST_F(DBFlushTest, DBStuckAfterAtomicFlushError) { ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); ASSERT_EQ(1, NumTableFilesAtLevel(0)); } + +TEST_F(DBFlushTest, VerifyOutputRecordCount) { + for (bool use_plain_table : {false, true}) { + Options options = CurrentOptions(); + options.flush_verify_memtable_count = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + DestroyAndReopen(options); + // Verify flush output record count verification in different table + // formats + if (use_plain_table) { + options.table_factory.reset(NewPlainTableFactory()); + } + + // Verify that flush output record count verification does not produce false + // positives. + ASSERT_OK(Merge("k0", "v1")); + ASSERT_OK(Put("k1", "v1")); + ASSERT_OK(Put("k2", "v1")); + ASSERT_OK(SingleDelete("k2")); + ASSERT_OK(Delete("k2")); + ASSERT_OK(Delete("k3")); + ASSERT_OK(db_->DeleteRange(WriteOptions(), "k1", "k3")); + ASSERT_OK(Flush()); + + // Verify that flush output record count verification catch corruption + DestroyAndReopen(options); + if (use_plain_table) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PlainTableBuilder::Add::skip", + [&](void* skip) { *(bool*)skip = true; }); + + } else { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::Add::skip", + [&](void* skip) { *(bool*)skip = true; }); + } + SyncPoint::GetInstance()->EnableProcessing(); + const char* expect = + "Number of keys in flush output SST files does not match"; + + // 1. During DB open flush + ASSERT_OK(Put("k1", "v1")); + ASSERT_OK(Put("k2", "v1")); + Status s = TryReopen(options); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), expect)); + + // 2. During regular flush + DestroyAndReopen(options); + ASSERT_OK(Put("k1", "v1")); + ASSERT_OK(Put("k2", "v1")); + s = Flush(); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), expect)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +class DBFlushSuperBlockTest + : public DBFlushTest, + public ::testing::WithParamInterface> { + public: + DBFlushSuperBlockTest() : DBFlushTest() {} + + std::string formatKey(int i) { + int desired_length = 10; + char buffer[64]; + snprintf(buffer, 64, "%0*d", desired_length, i); + return buffer; + } + + void VerifyReadWithGet(int key_count) { + for (int i = 0; i < key_count; ++i) { + PinnableSlice value; + ASSERT_OK(Get(formatKey(i), &value)); + ASSERT_EQ(value.ToString(), added_data[formatKey(i)]); + } + } + + void VerifyReadWithIterator(int key_count) { + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + int i = 0; + for (it->SeekToFirst(); it->Valid(); it->Next()) { + ASSERT_OK(it->status()); + ASSERT_EQ((it->key()).ToString(), formatKey(i)); + ASSERT_EQ((it->value()).ToString(), added_data[formatKey(i)]); + i++; + } + ASSERT_OK(it->status()); + ASSERT_EQ(i, key_count); + } + } + + protected: + Random rnd{123}; + std::unordered_map added_data; +}; + +constexpr size_t kLowSpaceOverheadRatio = 256; + +TEST_P(DBFlushSuperBlockTest, SuperBlock) { + constexpr int key_count = 12345; + Options options; + options.env = env_; + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + options.paranoid_file_checks = true; + options.write_buffer_size = 1024 * 1024; + BlockBasedTableOptions block_options; + block_options.block_align = get<0>(GetParam()); + block_options.index_block_restart_interval = 3; + block_options.super_block_alignment_size = get<1>(GetParam()); + block_options.super_block_alignment_space_overhead_ratio = get<2>(GetParam()); + options.table_factory.reset(NewBlockBasedTableFactory(block_options)); + if (block_options.block_align) { + // When block align is enabled, disable compression + options.compression = kNoCompression; + } + + ASSERT_OK(options.table_factory->ValidateOptions( + DBOptions(options), ColumnFamilyOptions(options))); + + Reopen(options); + + int super_block_pad_count = 0; + int super_block_pad_exceed_limit_count = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::WriteMaybeCompressedBlock:" + "SuperBlockAlignment", + [&super_block_pad_count](void* /*arg*/) { super_block_pad_count++; }); + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::WriteMaybeCompressedBlock:" + "SuperBlockAlignmentPaddingBytesExceedLimit", + [&super_block_pad_exceed_limit_count](void* /*arg*/) { + super_block_pad_exceed_limit_count++; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Add lots of keys + for (int i = 0; i < key_count; ++i) { + added_data[formatKey(i)] = std::string(rnd.RandomString(rnd.Next() % 1000)); + ASSERT_OK(Put(formatKey(i), added_data[formatKey(i)])); + } + + // flush the data in memory to disk to verify with super block alignment, the + // data could be read back properly + Reopen(options); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // When block_align is enabled, super block is always aligned, so there should + // be 0 padding for super block alignment + if (block_options.super_block_alignment_size != 0 && + !block_options.block_align) { + ASSERT_GT(super_block_pad_count, 0); + } else { + ASSERT_EQ(super_block_pad_count, 0); + } + + if (!block_options.block_align && + block_options.super_block_alignment_size != 0 && + block_options.super_block_alignment_space_overhead_ratio == + kLowSpaceOverheadRatio) { + ASSERT_GT(super_block_pad_exceed_limit_count, 0); + } + + // verify the values are correct + VerifyReadWithGet(key_count); + Reopen(options); + VerifyReadWithIterator(key_count); + + // verify checksum + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + + // Reopen options and flip the option of super block configuration, read still + // works. This verifies the forward/backward compatibility + if (block_options.super_block_alignment_size == 0) { + block_options.super_block_alignment_size = 16 * 1024; + } else { + block_options.super_block_alignment_size = 0; + } + options.table_factory.reset(NewBlockBasedTableFactory(block_options)); + + Reopen(options); + + // verify the values are correct + VerifyReadWithGet(key_count); + Reopen(options); + VerifyReadWithIterator(key_count); + + // verify checksum + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); +} + +INSTANTIATE_TEST_CASE_P( + SuperBlockTests, DBFlushSuperBlockTest, + testing::Combine(testing::Bool(), testing::Values(0, 32 * 1024, 16 * 1024), + // Use very low space overhead ratio to test + // the case where required padded bytes is + // larger than the max allowed padding size + testing::Values(4, kLowSpaceOverheadRatio))); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_follower_test.cc b/db/db_follower_test.cc index a0f35a46b619..c032464052c2 100644 --- a/db/db_follower_test.cc +++ b/db/db_follower_test.cc @@ -370,10 +370,10 @@ TEST_F(DBFollowerTest, RetryCatchupManifestRollover) { // This test creates 4 L0 files and compacts them. The follower, during catchup, // successfully instantiates 4 Versions corresponding to the 4 files (but -// donesn't install them yet), followed by deleting those 4 and adding a new +// doesn't install them yet), followed by deleting those 4 and adding a new // file from compaction. The test verifies that the 4 L0 files are deleted // correctly by the follower. -// We use teh Barrier* functions to ensure that the follower first sees the 4 +// We use the Barrier* functions to ensure that the follower first sees the 4 // L0 files and is able to link them, and then sees the compaction that // obsoletes those L0 files (so those L0 files are intermediates that it has // to explicitly delete). Suppose we don't have any barriers, its possible diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 96613dfad050..fea401477cc5 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -74,12 +74,14 @@ #include "options/cf_options.h" #include "options/options_helper.h" #include "options/options_parser.h" +#include "util/udt_util.h" #ifdef ROCKSDB_JEMALLOC #include "port/jemalloc_helper.h" #endif #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" @@ -168,7 +170,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, bool read_only) : dbname_(dbname), own_info_log_(options.info_log == nullptr), - init_logger_creation_s_(), initial_db_options_(SanitizeOptions(dbname, options, read_only, &init_logger_creation_s_)), env_(initial_db_options_.env), @@ -184,7 +185,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS, immutable_db_options_.use_adaptive_mutex), #endif // COERCE_CONTEXT_SWITCH - default_cf_handle_(nullptr), error_handler_(this, immutable_db_options_, &mutex_), event_logger_(immutable_db_options_.info_log.get()), max_total_in_memory_state_(0), @@ -193,45 +193,15 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, file_options_, immutable_db_options_)), seq_per_batch_(seq_per_batch), batch_per_txn_(batch_per_txn), - next_job_id_(1), - shutting_down_(false), - reject_new_background_jobs_(false), - db_lock_(nullptr), - manual_compaction_paused_(false), bg_cv_(&mutex_), - logfile_number_(0), - log_dir_synced_(false), - log_empty_(true), - persist_stats_cf_handle_(nullptr), - log_sync_cv_(&log_write_mutex_), - total_log_size_(0), - is_snapshot_supported_(true), + wal_sync_cv_(&wal_write_mutex_), write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()), write_thread_(immutable_db_options_), nonmem_write_thread_(immutable_db_options_), write_controller_(mutable_db_options_.delayed_write_rate), - last_batch_group_size_(0), - unscheduled_flushes_(0), - unscheduled_compactions_(0), - bg_bottom_compaction_scheduled_(0), - bg_compaction_scheduled_(0), - num_running_compactions_(0), - bg_flush_scheduled_(0), - num_running_flushes_(0), - bg_purge_scheduled_(0), - disable_delete_obsolete_files_(0), - pending_purge_obsolete_files_(0), delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()), - has_unpersisted_data_(false), - unable_to_release_oldest_log_(false), - num_running_ingest_file_(0), wal_manager_(immutable_db_options_, file_options_, io_tracer_, seq_per_batch), - bg_work_paused_(0), - bg_compaction_paused_(0), - refitting_level_(false), - opened_successfully_(false), - periodic_task_scheduler_(), two_write_queues_(options.two_write_queues), manual_wal_flush_(options.manual_wal_flush), // last_sequencee_ is always maintained by the main queue that also writes @@ -249,14 +219,11 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, // requires a custom gc for compaction, we use that to set use_custom_gc_ // as well. use_custom_gc_(seq_per_batch), - shutdown_initiated_(false), own_sfm_(options.sst_file_manager == nullptr), - closed_(false), atomic_flush_install_cv_(&mutex_), blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_, &error_handler_, &event_logger_, - immutable_db_options_.listeners, dbname_), - lock_wal_count_(0) { + immutable_db_options_.listeners, dbname_) { // !batch_per_trx_ implies seq_per_batch_ because it is only unset for // WriteUnprepared, which should use seq_per_batch_. assert(batch_per_txn_ || seq_per_batch_); @@ -284,15 +251,17 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, periodic_task_functions_.emplace(PeriodicTaskType::kFlushInfoLog, [this]() { this->FlushInfoLog(); }); periodic_task_functions_.emplace( - PeriodicTaskType::kRecordSeqnoTime, [this]() { - this->RecordSeqnoToTimeMapping(/*populate_historical_seconds=*/0); - }); + PeriodicTaskType::kRecordSeqnoTime, + [this]() { this->RecordSeqnoToTimeMapping(); }); + periodic_task_functions_.emplace( + PeriodicTaskType::kTriggerCompaction, + [this]() { this->TriggerPeriodicCompaction(); }); versions_.reset(new VersionSet( - dbname_, &immutable_db_options_, file_options_, table_cache_.get(), - write_buffer_manager_, &write_controller_, &block_cache_tracer_, - io_tracer_, db_id_, db_session_id_, options.daily_offpeak_time_utc, - &error_handler_, read_only)); + dbname_, &immutable_db_options_, mutable_db_options_, file_options_, + table_cache_.get(), write_buffer_manager_, &write_controller_, + &block_cache_tracer_, io_tracer_, db_id_, db_session_id_, + options.daily_offpeak_time_utc, &error_handler_, read_only)); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); @@ -351,6 +320,22 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { WaitForBackgroundWork(); + TEST_SYNC_POINT("DBImpl::ResumeImpl:Start"); + + // With two_write_queues=true, sequence numbers are allocated via + // FetchAddLastAllocatedSequence() before writes complete, but only + // published via SetLastSequence() after success. If we're recovering from + // an error, there may be allocated-but-not-published sequence numbers. + // We must sync last_sequence_ with last_allocated_sequence_ before creating + // any new memtables/WALs, otherwise the new WAL could start with a sequence + // number lower than what was already written, causing "sequence number + // going backwards" corruption on subsequent recovery. + if (immutable_db_options_.two_write_queues) { + versions_->SyncLastSequenceWithAllocated(); + } + + TEST_SYNC_POINT("DBImpl::ResumeImpl:AfterSyncSeq"); + Status s; if (shutdown_initiated_) { // Returning shutdown status to SFM during auto recovery will cause it @@ -636,8 +621,8 @@ Status DBImpl::CloseHelper() { mutex_.Lock(); } { - InstrumentedMutexLock lock(&log_write_mutex_); - for (auto l : logs_to_free_) { + InstrumentedMutexLock lock(&wal_write_mutex_); + for (auto l : wals_to_free_) { delete l; } for (auto& log : logs_) { @@ -821,7 +806,8 @@ Status DBImpl::StartPeriodicTaskScheduler() { Status s = periodic_task_scheduler_.Register( PeriodicTaskType::kDumpStats, periodic_task_functions_.at(PeriodicTaskType::kDumpStats), - mutable_db_options_.stats_dump_period_sec); + mutable_db_options_.stats_dump_period_sec, + /*run_immediately=*/true); if (!s.ok()) { return s; } @@ -830,7 +816,8 @@ Status DBImpl::StartPeriodicTaskScheduler() { Status s = periodic_task_scheduler_.Register( PeriodicTaskType::kPersistStats, periodic_task_functions_.at(PeriodicTaskType::kPersistStats), - mutable_db_options_.stats_persist_period_sec); + mutable_db_options_.stats_persist_period_sec, + /*run_immediately=*/true); if (!s.ok()) { return s; } @@ -838,64 +825,55 @@ Status DBImpl::StartPeriodicTaskScheduler() { Status s = periodic_task_scheduler_.Register( PeriodicTaskType::kFlushInfoLog, - periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog)); + periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog), + /*run_immediately=*/true); + + if (s.ok()) { + s = periodic_task_scheduler_.Register( + PeriodicTaskType::kTriggerCompaction, + periodic_task_functions_.at(PeriodicTaskType::kTriggerCompaction), + /*run_immediately=*/false); + } return s; } -Status DBImpl::RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options, - const WriteOptions& write_options, - bool is_new_db) { +Status DBImpl::RegisterRecordSeqnoTimeWorker() { options_mutex_.AssertHeld(); - uint64_t min_preserve_seconds = std::numeric_limits::max(); - uint64_t max_preserve_seconds = std::numeric_limits::min(); - std::vector sv_contexts; + // We assume InstallSuperVersionForConfigChange has already ensured suitable + // mappings are present for each relevant CF. We just need to be sure the DB's + // seqno_to_time_mapping_ and worker scheduler are appropriate for the + // combination of CF settings. + + MinAndMaxPreserveSeconds preserve_info; + uint64_t seqno_time_cadence; { InstrumentedMutexLock l(&mutex_); for (auto cfd : *versions_->GetColumnFamilySet()) { auto& mopts = cfd->GetLatestMutableCFOptions(); - // preserve time is the max of 2 options. - uint64_t preserve_seconds = - std::max(mopts.preserve_internal_time_seconds, - mopts.preclude_last_level_data_seconds); - if (!cfd->IsDropped() && preserve_seconds > 0) { - min_preserve_seconds = std::min(preserve_seconds, min_preserve_seconds); - max_preserve_seconds = std::max(preserve_seconds, max_preserve_seconds); + if (!cfd->IsDropped()) { + preserve_info.Combine(mopts); } } - size_t old_mapping_size = seqno_to_time_mapping_.Size(); - if (min_preserve_seconds == std::numeric_limits::max()) { - // Don't track + seqno_time_cadence = preserve_info.GetRecodingCadence(); + if (seqno_time_cadence == 0) { + // To return as much as possible to the feature being disabled, + // clear the existing mapping seqno_to_time_mapping_.SetCapacity(0); seqno_to_time_mapping_.SetMaxTimeSpan(UINT64_MAX); + assert(seqno_to_time_mapping_.Empty()); } else { uint64_t cap = std::min(kMaxSeqnoToTimeEntries, - max_preserve_seconds * kMaxSeqnoTimePairsPerCF / - min_preserve_seconds); + preserve_info.max_preserve_seconds * + kMaxSeqnoTimePairsPerCF / + preserve_info.min_preserve_seconds); seqno_to_time_mapping_.SetCapacity(cap); - seqno_to_time_mapping_.SetMaxTimeSpan(max_preserve_seconds); - } - if (old_mapping_size != seqno_to_time_mapping_.Size()) { - InstallSeqnoToTimeMappingInSV(&sv_contexts); + seqno_to_time_mapping_.SetMaxTimeSpan(preserve_info.max_preserve_seconds); } } - // clean up outside db mutex - for (SuperVersionContext& sv_context : sv_contexts) { - sv_context.Clean(); - } - sv_contexts.clear(); - - uint64_t seqno_time_cadence = 0; - if (min_preserve_seconds != std::numeric_limits::max()) { - // round up to 1 when the time_duration is smaller than - // kMaxSeqnoTimePairsPerCF - seqno_time_cadence = (min_preserve_seconds + kMaxSeqnoTimePairsPerCF - 1) / - kMaxSeqnoTimePairsPerCF; - } - TEST_SYNC_POINT_CALLBACK( "DBImpl::RegisterRecordSeqnoTimeWorker:BeforePeriodicTaskType", nullptr); @@ -903,68 +881,10 @@ Status DBImpl::RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options, if (seqno_time_cadence == 0) { s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kRecordSeqnoTime); } else { - // Before registering the periodic task, we need to be sure to fulfill two - // promises: - // 1) Any DB created with preserve/preclude options set from the beginning - // will get pre-allocated seqnos with pre-populated time mappings back to - // the times we are interested in. (This will enable future import of data - // while preserving rough write time. We can only do this reliably from - // DB::Open, as otherwise there could be a race between CreateColumnFamily - // and the first Write to the DB, and seqno-to-time mappings need to be - // monotonic. - // 2) In any DB, any data written after setting preserve/preclude options - // must have a reasonable time estimate (so that we can accurately place - // the data), which means at least one entry in seqno_to_time_mapping_. - // - // FIXME: We don't currently guarantee that if the first column family with - // that setting is added or configured after initial DB::Open but before - // the first user Write. Fixing this causes complications with the crash - // test because if DB starts without preserve/preclude option, does some - // user writes but all those writes are lost in crash, then re-opens with - // preserve/preclude option, it sees seqno==1 which looks like one of the - // user writes was recovered, when actually it was not. - bool last_seqno_zero = GetLatestSequenceNumber() == 0; - assert(!is_new_db || last_seqno_zero); - if (is_new_db && last_seqno_zero) { - // Pre-allocate seqnos and pre-populate historical mapping - // We can simply modify these, before writes are allowed - constexpr uint64_t kMax = kMaxSeqnoTimePairsPerSST; - versions_->SetLastAllocatedSequence(kMax); - versions_->SetLastPublishedSequence(kMax); - versions_->SetLastSequence(kMax); - - // And record in manifest, to avoid going backwards in seqno on re-open - // (potentially with different options). Concurrency is simple because we - // are in DB::Open - { - InstrumentedMutexLock l(&mutex_); - VersionEdit edit; - edit.SetLastSequence(kMax); - s = versions_->LogAndApplyToDefaultColumnFamily( - read_options, write_options, &edit, &mutex_, - directories_.GetDbDir()); - if (!s.ok() && versions_->io_status().IsIOError()) { - error_handler_.SetBGError(versions_->io_status(), - BackgroundErrorReason::kManifestWrite); - } - } - - // Pre-populate mappings for reserved sequence numbers. - RecordSeqnoToTimeMapping(max_preserve_seconds); - } else { - if (!last_seqno_zero) { - // Ensure at least one mapping (or log a warning), and - // an updated entry whenever relevant SetOptions is called - RecordSeqnoToTimeMapping(/*populate_historical_seconds=*/0); - } else { - // FIXME (see limitation described above) - } - } - s = periodic_task_scheduler_.Register( PeriodicTaskType::kRecordSeqnoTime, periodic_task_functions_.at(PeriodicTaskType::kRecordSeqnoTime), - seqno_time_cadence); + seqno_time_cadence, /*run_immediately=*/true); } return s; @@ -1165,7 +1085,7 @@ void DBImpl::DumpStats() { { InstrumentedMutexLock l(&mutex_); for (auto cfd : versions_->GetRefedColumnFamilySet()) { - if (!cfd->initialized()) { + if (!cfd->initialized() || cfd->IsDropped()) { continue; } @@ -1255,11 +1175,11 @@ Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family, void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) { mutex_.AssertHeld(); - if (!job_context->logs_to_free.empty()) { - for (auto l : job_context->logs_to_free) { + if (!job_context->wals_to_free.empty()) { + for (auto l : job_context->wals_to_free) { AddToLogsToFreeQueue(l); } - job_context->logs_to_free.clear(); + job_context->wals_to_free.clear(); } } @@ -1273,23 +1193,38 @@ FSDirectory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { } Status DBImpl::SetOptions( - ColumnFamilyHandle* column_family, - const std::unordered_map& options_map) { + const std::unordered_map>& + column_families_opts_map) { // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; const WriteOptions write_options; - auto* cfd = - static_cast_with_check(column_family)->cfd(); - if (options_map.empty()) { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "SetOptions() on column family [%s], empty input", - cfd->GetName().c_str()); - return Status::InvalidArgument("empty input"); + if (column_families_opts_map.empty()) { + return Status::OK(); + } + + for (const auto& cf_opts : column_families_opts_map) { + if (cf_opts.second.empty()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "SetOptions() on column family [%s], empty input", + cf_opts.first->GetName().c_str()); + return Status::InvalidArgument("empty input"); + } + } + + autovector*>> + column_family_datas; + for (const auto& cf_opts : column_families_opts_map) { + column_family_datas.push_back( + {static_cast_with_check(cf_opts.first)->cfd(), + &cf_opts.second}); } InstrumentedMutexLock ol(&options_mutex_); - MutableCFOptions new_options_copy; // For logging outside of DB mutex + autovector + new_options_copy; // For logging outside of DB mutex Status s; Status persist_options_status; SuperVersionContext sv_context(/* create_superversion */ true); @@ -1309,73 +1244,107 @@ Status DBImpl::SetOptions( // // (b) Append a new Version without manifest write nor DB mutex release // - // Thus aren't releasing the DB mutex again until the end of this block, - // after installing the new SuperVersion. - auto pre_cb = [&]() -> Status { - Status cb_s = cfd->SetOptions(db_options, options_map); - if (cb_s.ok()) { - new_options_copy = cfd->GetLatestMutableCFOptions(); - } - return cb_s; - }; + // Thus aren't releasing the DB mutex from LogAndApply calling pre_cb, + // through installing the new Version until the end of this block, after + // installing the new SuperVersion. VersionEdit dummy_edit; dummy_edit.MarkNoManifestWriteDummy(); TEST_SYNC_POINT_CALLBACK("DBImpl::SetOptions:dummy_edit", &dummy_edit); - s = versions_->LogAndApply( - cfd, read_options, write_options, &dummy_edit, &mutex_, - directories_.GetDbDir(), false /*new_descriptor_log=*/, - nullptr /*new_opts*/, {} /*manifest_wcb*/, pre_cb); - if (!versions_->io_status().ok()) { - assert(!s.ok()); - error_handler_.SetBGError(versions_->io_status(), - BackgroundErrorReason::kManifestWrite); + for (const auto& cfd_opts : column_family_datas) { + auto* cfd = cfd_opts.first; + const auto* options_map_ptr = cfd_opts.second; + auto pre_cb = [&]() -> Status { + Status cb_s = cfd->SetOptions(db_options, *options_map_ptr); + if (cb_s.ok()) { + new_options_copy.emplace_back(cfd->GetLatestMutableCFOptions()); + } + return cb_s; + }; + + s = versions_->LogAndApply( + cfd, read_options, write_options, &dummy_edit, &mutex_, + directories_.GetDbDir(), false /*new_descriptor_log=*/, + nullptr /*new_opts*/, {} /*manifest_wcb*/, pre_cb); + if (!versions_->io_status().ok()) { + assert(!s.ok()); + error_handler_.SetBGError(versions_->io_status(), + BackgroundErrorReason::kManifestWrite); + } + if (!s.ok()) { + break; + } } if (s.ok()) { // Trigger possible flush/compactions. This has to be before we persist // options to file, otherwise there will be a deadlock with writer // thread. - InstallSuperVersionAndScheduleWork(cfd, &sv_context); + for (const auto& cfd_opts : column_family_datas) { + InstallSuperVersionForConfigChange(cfd_opts.first, &sv_context); + } persist_options_status = WriteOptionsFile(write_options, true /*db_mutex_already_held*/); bg_cv_.SignalAll(); -#if __cplusplus >= 202002L - assert(new_options_copy == cfd->GetLatestMutableCFOptions()); - assert(cfd->GetLatestMutableCFOptions() == - cfd->GetCurrentMutableCFOptions()); - assert(cfd->GetCurrentMutableCFOptions() == - cfd->current()->GetMutableCFOptions()); +#ifndef NDEBUG + for (size_t i = 0; i < column_family_datas.size(); ++i) { + auto* cfd = column_family_datas[i].first; + assert(new_options_copy[i] == cfd->GetLatestMutableCFOptions()); + assert(cfd->GetLatestMutableCFOptions() == + cfd->GetCurrentMutableCFOptions()); + assert(cfd->GetCurrentMutableCFOptions() == + cfd->current()->GetMutableCFOptions()); + } #endif } } sv_context.Clean(); - if (s.ok() && (options_map.count("preserve_internal_time_seconds") > 0 || - options_map.count("preclude_last_level_data_seconds") > 0)) { - s = RegisterRecordSeqnoTimeWorker(read_options, write_options, - false /* is_new_db*/); + if (s.ok()) { + bool needs_seqno_worker = false; + for (const auto& cf_opts : column_families_opts_map) { + if (cf_opts.second.count("preserve_internal_time_seconds") > 0 || + cf_opts.second.count("preclude_last_level_data_seconds") > 0) { + needs_seqno_worker = true; + break; + } + } + if (needs_seqno_worker) { + s = RegisterRecordSeqnoTimeWorker(); + } } - ROCKS_LOG_INFO( - immutable_db_options_.info_log, - "SetOptions() on column family [%s], inputs:", cfd->GetName().c_str()); - for (const auto& o : options_map) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(), - o.second.c_str()); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "SetOptions() on [%zu] column families, inputs:", + column_family_datas.size()); + for (size_t i = 0; i < column_family_datas.size(); ++i) { + const auto* cfd = column_family_datas[i].first; + const auto* options_map_ptr = column_family_datas[i].second; + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Set options on column family [%s] (%zu/%zu), inputs:", + cfd->GetName().c_str(), i, column_family_datas.size()); + for (const auto& o : *options_map_ptr) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", + o.first.c_str(), o.second.c_str()); + } } if (s.ok()) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, - "[%s] SetOptions() succeeded", cfd->GetName().c_str()); - new_options_copy.Dump(immutable_db_options_.info_log.get()); + for (size_t i = 0; i < column_family_datas.size(); ++i) { + const auto* cfd = column_family_datas[i].first; + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Set options on column family [%s] (%zu/%zu) succeeded, " + "updated CF options:", + cfd->GetName().c_str(), i, column_family_datas.size()); + new_options_copy[i].Dump(immutable_db_options_.info_log.get()); + } if (!persist_options_status.ok()) { // NOTE: WriteOptionsFile already logs on failure s = persist_options_status; } } else { persist_options_status.PermitUncheckedError(); // less important - ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed", - cfd->GetName().c_str()); + ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetOptions() failed: %s", + s.ToString().c_str()); } LogFlush(immutable_db_options_.info_log); return s; @@ -1474,7 +1443,7 @@ Status DBImpl::SetDBOptions( s = periodic_task_scheduler_.Register( PeriodicTaskType::kDumpStats, periodic_task_functions_.at(PeriodicTaskType::kDumpStats), - new_options.stats_dump_period_sec); + new_options.stats_dump_period_sec, /*run_immediately=*/true); } if (new_options.max_total_wal_size != mutable_db_options_.max_total_wal_size) { @@ -1489,7 +1458,7 @@ Status DBImpl::SetDBOptions( s = periodic_task_scheduler_.Register( PeriodicTaskType::kPersistStats, periodic_task_functions_.at(PeriodicTaskType::kPersistStats), - new_options.stats_persist_period_sec); + new_options.stats_persist_period_sec, /*run_immediately=*/true); } } mutex_.Lock(); @@ -1510,7 +1479,7 @@ Status DBImpl::SetDBOptions( file_options_for_compaction_ = FileOptions(new_db_options); file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite( file_options_for_compaction_, immutable_db_options_); - versions_->ChangeFileOptions(mutable_db_options_); + versions_->UpdatedMutableDbOptions(mutable_db_options_, &mutex_); // TODO(xiez): clarify why apply optimize for read to write options file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead( file_options_for_compaction_, immutable_db_options_); @@ -1518,7 +1487,7 @@ Status DBImpl::SetDBOptions( WriteThread::Writer w; write_thread_.EnterUnbatched(&w, &mutex_); if (wal_other_option_changed || - total_log_size_ > GetMaxTotalWalSize()) { + wals_total_size_.LoadRelaxed() > GetMaxTotalWalSize()) { Status purge_wal_status = SwitchWAL(&write_context); if (!purge_wal_status.ok()) { ROCKS_LOG_WARN(immutable_db_options_.info_log, @@ -1545,14 +1514,9 @@ Status DBImpl::SetDBOptions( ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded"); new_options.Dump(immutable_db_options_.info_log.get()); if (!persist_options_status.ok()) { - if (immutable_db_options_.fail_if_options_file_error) { - s = Status::IOError( - "SetDBOptions() succeeded, but unable to persist options", - persist_options_status.ToString()); - } - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "Unable to persist options in SetDBOptions() -- %s", - persist_options_status.ToString().c_str()); + s = Status::IOError( + "SetDBOptions() succeeded, but unable to persist options", + persist_options_status.ToString()); } } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed"); @@ -1583,12 +1547,18 @@ int DBImpl::FindMinimumEmptyLevelFitting( return minimum_level; } +Status DBImpl::FlushWAL(const FlushWALOptions& options) { + WriteOptions write_options; + write_options.rate_limiter_priority = options.rate_limiter_priority; + return FlushWAL(write_options, options.sync); +} + Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) { if (manual_wal_flush_) { IOStatus io_s; { - // We need to lock log_write_mutex_ since logs_ might change concurrently - InstrumentedMutexLock wl(&log_write_mutex_); + // We need to lock wal_write_mutex_ since logs_ might change concurrently + InstrumentedMutexLock wl(&wal_write_mutex_); log::Writer* cur_log_writer = logs_.back().writer; io_s = cur_log_writer->WriteBuffer(write_options); } @@ -1615,7 +1585,7 @@ Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) { } bool DBImpl::WALBufferIsEmpty() { - InstrumentedMutexLock l(&log_write_mutex_); + InstrumentedMutexLock l(&wal_write_mutex_); log::Writer* cur_log_writer = logs_.back().writer; auto res = cur_log_writer->BufferIsEmpty(); return res; @@ -1623,7 +1593,7 @@ bool DBImpl::WALBufferIsEmpty() { Status DBImpl::GetOpenWalSizes(std::map& number_to_size) { assert(number_to_size.empty()); - InstrumentedMutexLock l(&log_write_mutex_); + InstrumentedMutexLock l(&wal_write_mutex_); for (auto& log : logs_) { auto* open_file = log.writer->file(); if (open_file) { @@ -1665,15 +1635,15 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal, uint64_t up_to_number; { - InstrumentedMutexLock l(&log_write_mutex_); + InstrumentedMutexLock l(&wal_write_mutex_); assert(!logs_.empty()); - maybe_active_number = logfile_number_; + maybe_active_number = cur_wal_number_; up_to_number = include_current_wal ? maybe_active_number : maybe_active_number - 1; while (logs_.front().number <= up_to_number && logs_.front().IsSyncing()) { - log_sync_cv_.Wait(); + wal_sync_cv_.Wait(); } // First check that logs are safe to sync in background. if (include_current_wal && @@ -1697,7 +1667,7 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal, } } - need_wal_dir_sync = !log_dir_synced_; + need_wal_dir_sync = !wal_dir_synced_; } if (include_current_wal) { @@ -1770,7 +1740,7 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal, /*arg=*/nullptr); } { - InstrumentedMutexLock l(&log_write_mutex_); + InstrumentedMutexLock l(&wal_write_mutex_); for (auto* wal : wals_internally_closed) { // We can only modify the state of log::Writer under the mutex bool was_closed = wal->PublishIfClosed(); @@ -1887,9 +1857,9 @@ Status DBImpl::UnlockWAL() { void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* synced_wals) { - log_write_mutex_.AssertHeld(); - if (synced_dir && logfile_number_ == up_to) { - log_dir_synced_ = true; + wal_write_mutex_.AssertHeld(); + if (synced_dir && cur_wal_number_ == up_to) { + wal_dir_synced_ = true; } for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) { auto& wal = *it; @@ -1911,7 +1881,7 @@ void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir, (immutable_db_options_.background_close_inactive_wals && wal.GetPreSyncSize() == wal.writer->file()->GetFlushedSize())) { // Fully synced - logs_to_free_.push_back(wal.ReleaseWriter()); + wals_to_free_.push_back(wal.ReleaseWriter()); it = logs_.erase(it); } else { wal.FinishSync(); @@ -1924,17 +1894,17 @@ void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir, ++it; } } - log_sync_cv_.SignalAll(); + wal_sync_cv_.SignalAll(); } void DBImpl::MarkLogsNotSynced(uint64_t up_to) { - log_write_mutex_.AssertHeld(); + wal_write_mutex_.AssertHeld(); for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to; ++it) { auto& wal = *it; wal.FinishSync(); } - log_sync_cv_.SignalAll(); + wal_sync_cv_.SignalAll(); } SequenceNumber DBImpl::GetLatestSequenceNumber() const { @@ -1970,6 +1940,69 @@ Status DBImpl::GetFullHistoryTsLow(ColumnFamilyHandle* column_family, return Status::OK(); } +Status DBImpl::GetNewestUserDefinedTimestamp(ColumnFamilyHandle* column_family, + std::string* newest_timestamp) { + if (newest_timestamp == nullptr) { + return Status::InvalidArgument("newest_timestamp is nullptr"); + } + ColumnFamilyData* cfd = nullptr; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = static_cast_with_check(column_family); + assert(cfh != nullptr); + cfd = cfh->cfd(); + } + assert(cfd != nullptr && cfd->user_comparator() != nullptr); + if (cfd->user_comparator()->timestamp_size() == 0) { + return Status::InvalidArgument( + "Timestamp is not enabled in this column family"); + } + if (cfd->ioptions().persist_user_defined_timestamps) { + return Status::NotSupported( + "GetNewestUserDefinedTimestamp doesn't support the case when user" + "defined timestamps are persisted."); + } + + Status status; + // Acquire SuperVersion + SuperVersion* sv = GetAndRefSuperVersion(cfd); + { + InstrumentedMutexLock l(&mutex_); + bool enter_write_thread = sv->mem == cfd->mem(); + WriteThread::Writer w; + // Enter write thread to read the mutable memtable to avoid racing access + // with concurrent writes. No need to enter nonmem_write_thread_ since this + // call only care about memtable writes, not WAL writes. + if (enter_write_thread) { + write_thread_.EnterUnbatched(&w, &mutex_); + WaitForPendingWrites(); + } + *newest_timestamp = sv->mem->GetNewestUDT().ToString(); + assert(!newest_timestamp->empty() || sv->mem->IsEmpty()); + if (enter_write_thread) { + write_thread_.ExitUnbatched(&w); + } + } + // Read from immutable memtables if nothing found in mutable memtable. + if (newest_timestamp->empty()) { + *newest_timestamp = sv->imm->GetNewestUDT().ToString(); + } + // Read from SST files if no result can be found in memtables. + if (newest_timestamp->empty() && sv->current->GetSstFilesSize() != 0) { + // full_history_ts_low is used to track the exclusive upperbound of + // flushed user defined timestamp. So we can use it to deduce the newest + // timestamp in the SST files that the column family has seen. + Slice full_history_ts_low = sv->full_history_ts_low; + if (!full_history_ts_low.empty()) { + GetU64CutoffTsFromFullHistoryTsLow(&full_history_ts_low, + newest_timestamp); + } + } + ReturnAndCleanupSuperVersion(cfd, sv); + return status; +} + InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, Arena* arena, SequenceNumber sequence, @@ -2003,10 +2036,10 @@ void DBImpl::BackgroundCallPurge() { TEST_SYNC_POINT("DBImpl::BackgroundCallPurge:beforeMutexLock"); mutex_.Lock(); - while (!logs_to_free_queue_.empty()) { - assert(!logs_to_free_queue_.empty()); - log::Writer* log_writer = *(logs_to_free_queue_.begin()); - logs_to_free_queue_.pop_front(); + while (!wals_to_free_queue_.empty()) { + assert(!wals_to_free_queue_.empty()); + log::Writer* log_writer = *(wals_to_free_queue_.begin()); + wals_to_free_queue_.pop_front(); mutex_.Unlock(); delete log_writer; mutex_.Lock(); @@ -2704,7 +2737,7 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options, } }; - bool last_try = false; + bool acquire_mutex = false; if (cf_list->size() == 1) { // Fast path for a single column family. We can simply get the thread local // super version @@ -2753,29 +2786,32 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options, // sure. constexpr int num_retries = 3; for (int i = 0; i < num_retries; ++i) { - last_try = (i == num_retries - 1); + // When reading from kPersistedTier, we want a consistent view into CFs. + // So we take mutex to prevent any SV change in any CF. + acquire_mutex = ((i == num_retries - 1) && !read_options.snapshot) || + read_options.read_tier == kPersistedTier; bool retry = false; if (i > 0) { sv_cleanup_func(); } if (read_options.snapshot == nullptr) { - if (last_try) { - TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::LastTry"); - // We're close to max number of retries. For the last retry, - // acquire the lock so we're sure to succeed - mutex_.Lock(); - } *snapshot = GetLastPublishedSequence(); } else { *snapshot = static_cast_with_check(read_options.snapshot) ->number_; } + if (acquire_mutex) { + TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::LastTry"); + // We're close to max number of retries. For the last retry, + // acquire the lock so we're sure to succeed + mutex_.Lock(); + } for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end(); ++cf_iter) { auto node = iter_deref_func(cf_iter); - if (!last_try) { + if (!acquire_mutex) { if (extra_sv_ref) { node->super_version = node->cfd->GetReferencedSuperVersion(this); } else { @@ -2799,7 +2835,7 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options, } } TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::BeforeCheckingSnapshot"); - if (read_options.snapshot != nullptr || last_try) { + if (read_options.snapshot != nullptr || acquire_mutex) { // If user passed a snapshot, then we don't care if a memtable is // sealed or compaction happens because the snapshot would ensure // that older key versions are kept around. If this is the last @@ -2810,7 +2846,7 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options, // memtables, which will include immutable memtables as well, but that // might be tricky to maintain in case we decide, in future, to do // memtable compaction. - if (!last_try) { + if (!acquire_mutex) { SequenceNumber seq = node->super_version->mem->GetEarliestSequenceNumber(); if (seq > *snapshot) { @@ -2820,19 +2856,20 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options, } } if (!retry) { - if (last_try) { + if (acquire_mutex) { mutex_.Unlock(); TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::AfterLastTryRefSV"); } break; } + assert(!acquire_mutex); } } TEST_SYNC_POINT("DBImpl::MultiCFSnapshot:AfterGetSeqNum1"); TEST_SYNC_POINT("DBImpl::MultiCFSnapshot:AfterGetSeqNum2"); PERF_TIMER_STOP(get_snapshot_time); - *sv_from_thread_local = !last_try; + *sv_from_thread_local = !acquire_mutex; if (!s.ok()) { sv_cleanup_func(); } @@ -3538,7 +3575,7 @@ void DBImpl::MultiGetEntityWithCallback( } Status DBImpl::WrapUpCreateColumnFamilies( - const ReadOptions& read_options, const WriteOptions& write_options, + const WriteOptions& write_options, const std::vector& cf_options) { options_mutex_.AssertHeld(); @@ -3555,8 +3592,7 @@ Status DBImpl::WrapUpCreateColumnFamilies( // Attempt both follow-up actions even if one fails Status s = WriteOptionsFile(write_options, false /*db_mutex_already_held*/); if (register_worker) { - s.UpdateIfOk(RegisterRecordSeqnoTimeWorker(read_options, write_options, - /* is_new_db */ false)); + s.UpdateIfOk(RegisterRecordSeqnoTimeWorker()); } return s; } @@ -3571,8 +3607,7 @@ Status DBImpl::CreateColumnFamily(const ReadOptions& read_options, Status s = CreateColumnFamilyImpl(read_options, write_options, cf_options, column_family, handle); if (s.ok()) { - s.UpdateIfOk( - WrapUpCreateColumnFamilies(read_options, write_options, {&cf_options})); + s.UpdateIfOk(WrapUpCreateColumnFamilies(write_options, {&cf_options})); } return s; } @@ -3599,8 +3634,7 @@ Status DBImpl::CreateColumnFamilies( success_once = true; } if (success_once) { - s.UpdateIfOk( - WrapUpCreateColumnFamilies(read_options, write_options, {&cf_options})); + s.UpdateIfOk(WrapUpCreateColumnFamilies(write_options, {&cf_options})); } return s; } @@ -3630,8 +3664,7 @@ Status DBImpl::CreateColumnFamilies( cf_opts.push_back(&column_families[i].options); } if (success_once) { - s.UpdateIfOk( - WrapUpCreateColumnFamilies(read_options, write_options, cf_opts)); + s.UpdateIfOk(WrapUpCreateColumnFamilies(write_options, cf_opts)); } return s; } @@ -3672,7 +3705,7 @@ Status DBImpl::CreateColumnFamilyImpl(const ReadOptions& read_options, edit.AddColumnFamily(column_family_name); uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); edit.SetColumnFamily(new_id); - edit.SetLogNumber(logfile_number_); + edit.SetLogNumber(cur_wal_number_); edit.SetComparatorName(cf_options.comparator->Name()); edit.SetPersistUserDefinedTimestamps( cf_options.persist_user_defined_timestamps); @@ -3700,7 +3733,7 @@ Status DBImpl::CreateColumnFamilyImpl(const ReadOptions& read_options, auto* cfd = versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); assert(cfd != nullptr); - InstallSuperVersionAndScheduleWork(cfd, &sv_context); + InstallSuperVersionForConfigChange(cfd, &sv_context); if (!cfd->mem()->IsSnapshotSupported()) { is_snapshot_supported_ = false; @@ -3784,7 +3817,7 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { Status s; // Save re-aquiring lock for RegisterRecordSeqnoTimeWorker when not // applicable - bool used_preserve_preclude = false; + MinAndMaxPreserveSeconds preserve_info; { InstrumentedMutexLock l(&mutex_); if (cfd->IsDropped()) { @@ -3802,8 +3835,7 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { auto& moptions = cfd->GetLatestMutableCFOptions(); max_total_in_memory_state_ -= moptions.write_buffer_size * moptions.max_write_buffer_number; - used_preserve_preclude = moptions.preserve_internal_time_seconds > 0 || - moptions.preclude_last_level_data_seconds > 0; + preserve_info.Combine(moptions); } if (!cf_support_snapshot) { @@ -3821,9 +3853,8 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { bg_cv_.SignalAll(); } - if (used_preserve_preclude) { - s = RegisterRecordSeqnoTimeWorker(read_options, write_options, - /* is_new_db */ false); + if (preserve_info.IsEnabled()) { + s = RegisterRecordSeqnoTimeWorker(); } if (s.ok()) { @@ -3873,6 +3904,14 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, return s.ok() || s.IsIncomplete(); } +std::unique_ptr DBImpl::NewMultiScan( + const ReadOptions& _read_options, ColumnFamilyHandle* column_family, + const MultiScanArgs& scan_opts) { + std::unique_ptr ms_iter = std::make_unique( + _read_options, scan_opts, this, column_family); + return ms_iter; +} + Iterator* DBImpl::NewIterator(const ReadOptions& _read_options, ColumnFamilyHandle* column_family) { if (_read_options.io_activity != Env::IOActivity::kUnknown && @@ -3886,10 +3925,6 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options, read_options.io_activity = Env::IOActivity::kDBIterator; } - if (read_options.managed) { - return NewErrorIterator( - Status::NotSupported("Managed iterator is not supported anymore.")); - } Iterator* result = nullptr; if (read_options.read_tier == kPersistedTier) { return NewErrorIterator(Status::NotSupported( @@ -3929,11 +3964,14 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options, auto iter = new ForwardIterator(this, read_options, cfd, sv, /* allow_unprepared_value */ true); - result = NewDBIterator( - env_, read_options, cfd->ioptions(), sv->mutable_cf_options, - cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - nullptr /* read_callback */, cfh); + // TODO(cbi): Add support for `memtable_op_scan_flush_trigger` for tailing + // iterator. This requires refreshing DBIter's pointer to active_mem when + // tailing iterator refreshes to new memtable internally. + result = DBIter::NewIter(env_, read_options, cfd->ioptions(), + sv->mutable_cf_options, cfd->user_comparator(), + iter, sv->current, kMaxSequenceNumber, + /*read_callback=*/nullptr, /*active_mem=*/nullptr, + cfh, /*expose_blob_index=*/false); } else { // Note: no need to consider the special case of // last_seq_same_as_publish_seq_==false since NewIterator is overridden in @@ -4011,18 +4049,9 @@ ArenaWrappedDBIter* DBImpl::NewIteratorImpl( // Laying out the iterators in the order of being accessed makes it more // likely that any iterator pointer is close to the iterator it points to so // that they are likely to be in the same cache line and/or page. - ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, read_options, cfh->cfd()->ioptions(), sv->mutable_cf_options, - sv->current, snapshot, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number, read_callback, cfh, expose_blob_index, allow_refresh); - - InternalIterator* internal_iter = NewInternalIterator( - db_iter->GetReadOptions(), cfh->cfd(), sv, db_iter->GetArena(), snapshot, - /* allow_unprepared_value */ true, db_iter); - db_iter->SetIterUnderDBIter(internal_iter); - - return db_iter; + return NewArenaWrappedDbIterator( + env_, read_options, cfh, sv, snapshot, read_callback, this, + expose_blob_index, allow_refresh, /*allow_mark_memtable_for_flush=*/true); } std::unique_ptr DBImpl::NewCoalescingIterator( @@ -4095,9 +4124,6 @@ Status DBImpl::NewIterators( if (read_options.io_activity == Env::IOActivity::kUnknown) { read_options.io_activity = Env::IOActivity::kDBIterator; } - if (read_options.managed) { - return Status::NotSupported("Managed iterator is not supported anymore."); - } if (read_options.read_tier == kPersistedTier) { return Status::NotSupported( "ReadTier::kPersistedData is not yet supported in iterators."); @@ -4146,14 +4172,12 @@ Status DBImpl::NewIterators( auto iter = new ForwardIterator(this, read_options, cf_sv_pair.cfd, cf_sv_pair.super_version, /* allow_unprepared_value */ true); - iterators->push_back( - NewDBIterator(env_, read_options, cf_sv_pair.cfd->ioptions(), - cf_sv_pair.super_version->mutable_cf_options, - cf_sv_pair.cfd->user_comparator(), iter, - cf_sv_pair.super_version->current, kMaxSequenceNumber, - cf_sv_pair.super_version->mutable_cf_options - .max_sequential_skip_in_iterations, - nullptr /*read_callback*/, cf_sv_pair.cfh)); + iterators->push_back(DBIter::NewIter( + env_, read_options, cf_sv_pair.cfd->ioptions(), + cf_sv_pair.super_version->mutable_cf_options, + cf_sv_pair.cfd->user_comparator(), iter, + cf_sv_pair.super_version->current, kMaxSequenceNumber, + nullptr /*read_callback*/, /*active_mem=*/nullptr, cf_sv_pair.cfh)); } } else { for (const auto& cf_sv_pair : cf_sv_pairs) { @@ -4385,9 +4409,10 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) { CfdList cf_scheduled; if (oldest_snapshot > bottommost_files_mark_threshold_) { for (auto* cfd : *versions_->GetColumnFamilySet()) { - if (!cfd->ioptions().allow_ingest_behind) { + if (!cfd->AllowIngestBehind()) { cfd->current()->storage_info()->UpdateOldestSnapshot( - oldest_snapshot, /*allow_ingest_behind=*/false); + oldest_snapshot, /*allow_ingest_behind=*/false, + cfd->ioptions().user_comparator, cfd->GetFullHistoryTsLow()); if (!cfd->current() ->storage_info() ->BottommostFilesMarkedForCompaction() @@ -4405,8 +4430,7 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) { // inaccurate. SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber; for (auto* cfd : *versions_->GetColumnFamilySet()) { - if (CfdListContains(cf_scheduled, cfd) || - cfd->ioptions().allow_ingest_behind) { + if (CfdListContains(cf_scheduled, cfd) || cfd->AllowIngestBehind()) { continue; } new_bottommost_files_mark_threshold = std::min( @@ -4485,7 +4509,7 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, // Add timestamp if needed for (size_t i = 0; i < n; i++) { auto [start, limit] = MaybeAddTimestampsToRange( - &range[i].start, &range[i].limit, ts_sz, &keys.emplace_back(), + range[i].start, range[i].limit, ts_sz, &keys.emplace_back(), &keys.emplace_back(), /*exclusive_end=*/false); assert(start.has_value()); assert(limit.has_value()); @@ -4502,6 +4526,29 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, return s; } +Status DBImpl::GetPropertiesOfTablesByLevel( + ColumnFamilyHandle* column_family, + std::vector>* props_by_level) { + auto cfh = static_cast_with_check(column_family); + auto cfd = cfh->cfd(); + + // Increment the ref count + mutex_.Lock(); + auto version = cfd->current(); + version->Ref(); + mutex_.Unlock(); + + const ReadOptions read_options; + auto s = version->GetPropertiesOfTablesByLevel(read_options, props_by_level); + + // Decrement the ref count + mutex_.Lock(); + version->Unref(); + mutex_.Unlock(); + + return s; +} + const std::string& DBImpl::GetName() const { return dbname_; } Env* DBImpl::GetEnv() const { return env_; } @@ -4802,7 +4849,7 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family, // Add timestamp if needed std::string start_with_ts, limit_with_ts; auto [start, limit] = MaybeAddTimestampsToRange( - &range.start, &range.limit, ts_sz, &start_with_ts, &limit_with_ts); + range.start, range.limit, ts_sz, &start_with_ts, &limit_with_ts); assert(start.has_value()); assert(limit.has_value()); // Convert user_key into a corresponding internal key. @@ -4840,9 +4887,8 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, for (int i = 0; i < n; i++) { // Add timestamp if needed std::string start_with_ts, limit_with_ts; - auto [start, limit] = - MaybeAddTimestampsToRange(&range[i].start, &range[i].limit, ts_sz, - &start_with_ts, &limit_with_ts); + auto [start, limit] = MaybeAddTimestampsToRange( + range[i].start, range[i].limit, ts_sz, &start_with_ts, &limit_with_ts); assert(start.has_value()); assert(limit.has_value()); // Convert user_key into a corresponding internal key. @@ -4918,7 +4964,7 @@ Status DBImpl::GetUpdatesSince( } Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, - const RangePtr* ranges, size_t n, + const RangeOpt* ranges, size_t n, bool include_end) { // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; @@ -4930,7 +4976,7 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, const Comparator* ucmp = cfd->user_comparator(); assert(ucmp); const size_t ts_sz = ucmp->timestamp_size(); - autovector ukey_ranges; + autovector ukey_ranges; std::vector keys; std::vector key_slices; ukey_ranges.reserve(n); @@ -4940,8 +4986,8 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, auto [start, limit] = MaybeAddTimestampsToRange( ranges[i].start, ranges[i].limit, ts_sz, &keys.emplace_back(), &keys.emplace_back(), !include_end); - assert((ranges[i].start != nullptr) == start.has_value()); - assert((ranges[i].limit != nullptr) == limit.has_value()); + assert(ranges[i].start.has_value() == start.has_value()); + assert(ranges[i].limit.has_value() == limit.has_value()); ukey_ranges.emplace_back(start, limit); } @@ -5002,7 +5048,8 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, } if (!deleted_files.empty()) { vstorage->ComputeCompactionScore(cfd->ioptions(), - cfd->GetLatestMutableCFOptions()); + cfd->GetLatestMutableCFOptions(), + cfd->GetFullHistoryTsLow()); } if (edit.GetDeletedFiles().empty()) { job_context.Clean(); @@ -5047,7 +5094,6 @@ void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, assert(column_family); auto* cfd = static_cast_with_check(column_family)->cfd(); - auto* sv = GetAndRefSuperVersion(cfd); { // Without mutex, Version::GetColumnFamilyMetaData will have data race // with Compaction::MarkFilesBeingCompacted. One solution is to use mutex, @@ -5059,9 +5105,21 @@ void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, // DB::GetColumnFamilyMetaData is not called frequently, the regression // should not be big. We still need to keep an eye on it. InstrumentedMutexLock l(&mutex_); - sv->current->GetColumnFamilyMetaData(cf_meta); + cfd->current()->GetColumnFamilyMetaData(cf_meta); + } +} + +void DBImpl::GetColumnFamilyMetaData( + ColumnFamilyHandle* column_family, + const GetColumnFamilyMetaDataOptions& options, + ColumnFamilyMetaData* metadata) { + assert(column_family); + auto* cfd = + static_cast_with_check(column_family)->cfd(); + { + InstrumentedMutexLock l(&mutex_); + cfd->current()->GetColumnFamilyMetaData(options, metadata); } - ReturnAndCleanupSuperVersion(cfd, sv); } void DBImpl::GetAllColumnFamilyMetaData( @@ -5075,85 +5133,6 @@ void DBImpl::GetAllColumnFamilyMetaData( } } -Status DBImpl::CheckConsistency() { - mutex_.AssertHeld(); - std::vector metadata; - versions_->GetLiveFilesMetaData(&metadata); - TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData"); - - std::string corruption_messages; - - if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) { - // Instead of calling GetFileSize() for each expected file, call - // GetChildren() for the DB directory and check that all expected files - // are listed, without checking their sizes. - // Since sst files might be in different directories, do it for each - // directory separately. - std::map> files_by_directory; - for (const auto& md : metadata) { - // md.name has a leading "/". Remove it. - std::string fname = md.name; - if (!fname.empty() && fname[0] == '/') { - fname = fname.substr(1); - } - files_by_directory[md.db_path].push_back(fname); - } - - IOOptions io_opts; - io_opts.do_not_recurse = true; - for (const auto& dir_files : files_by_directory) { - std::string directory = dir_files.first; - std::vector existing_files; - Status s = fs_->GetChildren(directory, io_opts, &existing_files, - /*IODebugContext*=*/nullptr); - if (!s.ok()) { - corruption_messages += - "Can't list files in " + directory + ": " + s.ToString() + "\n"; - continue; - } - std::sort(existing_files.begin(), existing_files.end()); - - for (const std::string& fname : dir_files.second) { - if (!std::binary_search(existing_files.begin(), existing_files.end(), - fname) && - !std::binary_search(existing_files.begin(), existing_files.end(), - Rocks2LevelTableFileName(fname))) { - corruption_messages += - "Missing sst file " + fname + " in " + directory + "\n"; - } - } - } - } else { - for (const auto& md : metadata) { - // md.name has a leading "/". - std::string file_path = md.db_path + md.name; - - uint64_t fsize = 0; - TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize"); - Status s = env_->GetFileSize(file_path, &fsize); - if (!s.ok() && - env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) { - s = Status::OK(); - } - if (!s.ok()) { - corruption_messages += - "Can't access " + md.name + ": " + s.ToString() + "\n"; - } else if (fsize != md.size) { - corruption_messages += "Sst file size mismatch: " + file_path + - ". Size recorded in manifest " + - std::to_string(md.size) + ", actual size " + - std::to_string(fsize) + "\n"; - } - } - } - - if (corruption_messages.size() == 0) { - return Status::OK(); - } else { - return Status::Corruption(corruption_messages); - } -} - Status DBImpl::GetDbIdentity(std::string& identity) const { identity.assign(db_id_); return Status::OK(); @@ -5490,12 +5469,7 @@ Status DBImpl::WriteOptionsFile(const WriteOptions& write_options, if (!s.ok()) { ROCKS_LOG_WARN(immutable_db_options_.info_log, "Unnable to persist options -- %s", s.ToString().c_str()); - if (immutable_db_options_.fail_if_options_file_error) { - s = Status::IOError("Unable to persist options.", s.ToString().c_str()); - } else { - // Ignore error - s = Status::OK(); - } + s = Status::IOError("Unable to persist options.", s.ToString().c_str()); } // Restore lock if appropriate @@ -5607,7 +5581,7 @@ Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name, return s; } -#ifdef ROCKSDB_USING_THREAD_STATUS +#ifndef NROCKSDB_THREAD_STATUS void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const { if (immutable_db_options_.enable_thread_tracking) { @@ -5634,7 +5608,7 @@ void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {} void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {} void DBImpl::EraseThreadStatusDbInfo() const {} -#endif // ROCKSDB_USING_THREAD_STATUS +#endif // !NROCKSDB_THREAD_STATUS // // A global method that can dump out the build version @@ -5865,10 +5839,6 @@ Status DBImpl::IngestExternalFiles( for (const auto& arg : args) { const IngestExternalFileOptions& ingest_opts = arg.options; if (ingest_opts.ingest_behind) { - if (!immutable_db_options_.allow_ingest_behind) { - return Status::InvalidArgument( - "can't ingest_behind file in DB with allow_ingest_behind=false"); - } auto ucmp = arg.column_family->GetComparator(); assert(ucmp); if (ucmp->timestamp_size() > 0) { @@ -5876,7 +5846,36 @@ Status DBImpl::IngestExternalFiles( "Column family with user-defined " "timestamps enabled doesn't support ingest behind."); } + + if (!static_cast(arg.column_family) + ->cfd() + ->AllowIngestBehind()) { + return Status::InvalidArgument( + "Can't ingest_behind file in ColumnFamily %s with " + "cf_allow_ingest_behind=false"); + } + } + if (arg.atomic_replace_range.has_value()) { + if (ingest_opts.ingest_behind) { + return Status::InvalidArgument( + "Can't combine atomic_replace_range with ingest_behind."); + } + if (ingest_opts.snapshot_consistency) { + // TODO: support generating and ingesting a big tombstone file, which + // might depend on non-nullptr start and limit + return Status::NotSupported( + "atomic_replace_range not yet supported with " + "snapshot_consistency."); + } else { + if (arg.atomic_replace_range->start.has_value() ^ + arg.atomic_replace_range->limit.has_value()) { + return Status::NotSupported( + "Only one of atomic_replace_range.{start,limit}.has_value() is " + "not supported."); + } + } } + if (ingest_opts.allow_db_generated_files) { if (ingest_opts.write_global_seqno) { return Status::NotSupported( @@ -5925,8 +5924,8 @@ Status DBImpl::IngestExternalFiles( this); Status es = ingestion_jobs[i].Prepare( args[i].external_files, args[i].files_checksums, - args[i].files_checksum_func_names, args[i].file_temperature, - start_file_number, super_version); + args[i].files_checksum_func_names, args[i].atomic_replace_range, + args[i].file_temperature, start_file_number, super_version); // capture first error only if (!es.ok() && status.ok()) { status = es; @@ -5941,8 +5940,8 @@ Status DBImpl::IngestExternalFiles( this); Status es = ingestion_jobs[0].Prepare( args[0].external_files, args[0].files_checksums, - args[0].files_checksum_func_names, args[0].file_temperature, - next_file_number, super_version); + args[0].files_checksum_func_names, args[0].atomic_replace_range, + args[0].file_temperature, next_file_number, super_version); if (!es.ok()) { status = es; } @@ -6089,18 +6088,19 @@ Status DBImpl::IngestExternalFiles( // mutex when persisting MANIFEST file, and the snapshots taken during // that period will not be stable if VersionSet last seqno is updated // before LogAndApply. - int consumed_seqno_count = - ingestion_jobs[0].ConsumedSequenceNumbersCount(); + SequenceNumber max_assigned_seqno = + ingestion_jobs[0].MaxAssignedSequenceNumber(); for (size_t i = 1; i != num_cfs; ++i) { - consumed_seqno_count = - std::max(consumed_seqno_count, - ingestion_jobs[i].ConsumedSequenceNumbersCount()); + max_assigned_seqno = std::max( + max_assigned_seqno, ingestion_jobs[i].MaxAssignedSequenceNumber()); } - if (consumed_seqno_count > 0) { + if (max_assigned_seqno > 0) { const SequenceNumber last_seqno = versions_->LastSequence(); - versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count); - versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count); - versions_->SetLastSequence(last_seqno + consumed_seqno_count); + if (max_assigned_seqno > last_seqno) { + versions_->SetLastAllocatedSequence(max_assigned_seqno); + versions_->SetLastPublishedSequence(max_assigned_seqno); + versions_->SetLastSequence(max_assigned_seqno); + } } } @@ -6235,7 +6235,7 @@ Status DBImpl::CreateColumnFamilyWithImport( versions_->LogAndApply(cfd, read_options, write_options, &dummy_edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx); + InstallSuperVersionForConfigChange(cfd, &dummy_sv_ctx); } } } @@ -6272,7 +6272,7 @@ Status DBImpl::CreateColumnFamilyWithImport( import_job.edit(), &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWork(cfd, &sv_context); + InstallSuperVersionForConfigChange(cfd, &sv_context); } } @@ -6333,9 +6333,9 @@ Status DBImpl::ClipColumnFamily(ColumnFamilyHandle* column_family, if (status.ok()) { // DeleteFilesInRanges non-overlap files except L0 - std::vector ranges; - ranges.emplace_back(nullptr, &begin_key); - ranges.emplace_back(&end_key, nullptr); + std::vector ranges; + ranges.emplace_back(OptSlice{}, begin_key); + ranges.emplace_back(end_key, OptSlice{}); status = DeleteFilesInRanges(column_family, ranges.data(), ranges.size()); } @@ -6480,8 +6480,11 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, fmeta->file_checksum_func_name, fname, read_options); } else { + FileOptions fopts = file_options_; + fopts.file_checksum = fmeta->file_checksum; + fopts.file_checksum_func_name = fmeta->file_checksum_func_name; s = ROCKSDB_NAMESPACE::VerifySstFileChecksumInternal( - opts, file_options_, read_options, fname, fd.largest_seqno); + opts, fopts, read_options, fname, fd.largest_seqno); } RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES, IOSTATS(bytes_read) - prev_bytes_read); @@ -6549,12 +6552,15 @@ Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected, } std::string file_checksum; std::string func_name; + FileOptions fopts; + fopts.file_checksum = file_checksum_expected; + fopts.file_checksum_func_name = func_name_expected; s = ROCKSDB_NAMESPACE::GenerateOneFileChecksum( fs_.get(), fname, immutable_db_options_.file_checksum_gen_factory.get(), func_name_expected, &file_checksum, &func_name, read_options.readahead_size, immutable_db_options_.allow_mmap_reads, io_tracer_, immutable_db_options_.rate_limiter.get(), read_options, - immutable_db_options_.stats, immutable_db_options_.clock); + immutable_db_options_.stats, immutable_db_options_.clock, fopts); if (s.ok()) { assert(func_name_expected == func_name); if (file_checksum != file_checksum_expected) { @@ -6732,7 +6738,7 @@ Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) { } } -void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) { +std::pair DBImpl::GetSeqnoToTimeSample() const { // TECHNICALITY: Sample last sequence number *before* time, as prescribed // for SeqnoToTimeMapping. We don't know how long it has been since the last // sequence number was written, so we at least have a one-sided bound by @@ -6741,62 +6747,191 @@ void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) { // while holding the DB mutex. (This is really to make testing happy because // it's fine to throw out extra close-but-not-quite-consistent mappings in // production.) - std::vector sv_contexts; - bool success = true; - SequenceNumber seqno; - uint64_t unix_time; + mutex_.AssertHeld(); + SequenceNumber seqno = GetLatestSequenceNumber(); + // HACK/TODO: seqno might be zero but we can't record a mapping for that. + // Start with 1, which should be close enough. + seqno = std::max(seqno, SequenceNumber{1}); + int64_t unix_time_signed = 0; + immutable_db_options_.clock->GetCurrentTime(&unix_time_signed) + .PermitUncheckedError(); // Ignore error + return {seqno, static_cast(unix_time_signed)}; +} + +void DBImpl::EnsureSeqnoToTimeMapping( + const MinAndMaxPreserveSeconds& preserve_info) { + mutex_.AssertHeld(); + assert(preserve_info.IsEnabled()); + + // Atomically with CF creation or mutable option change (see + // InstallSuperVersionForConfigChange()), we need to be sure any data written + // after setting preserve/preclude options must have a reasonable time + // estimate (so that we can accurately place the data), which means at least + // one entry in seqno_to_time_mapping_. It's not critical that `preserve_info` + // take into account all CFs, as that's mostly relevant to how we add + // recurring entries and purge old ones. + + auto [seqno, unix_time_now] = GetSeqnoToTimeSample(); + // Ensure at least one sample that is sufficiently recent + uint64_t unix_time_last_sample = 0; + if (seqno_to_time_mapping_.Empty()) { + // The exact best settings will be found and applied in + // RegisterRecordSeqnoTimeWorker() + seqno_to_time_mapping_.SetCapacity(kMaxSeqnoToTimeEntries); + } else { + unix_time_last_sample = + seqno_to_time_mapping_.GetProximalTimeBeforeSeqno(kMaxSequenceNumber); + } + uint64_t cadence = preserve_info.GetRecodingCadence(); + // Extend cadence so as to avoid stepping on toes of recorder job, which + // could lag a bit. + cadence += 3 + cadence / 100; + if (unix_time_now >= cadence && + unix_time_last_sample <= unix_time_now - cadence) { + assert(seqno > 0); // See GetSeqnoToTimeSample() + // Always successful assuming seqno never go backwards + seqno_to_time_mapping_.Append(seqno, unix_time_now); + } +} + +void DBImpl::PrepopulateSeqnoToTimeMapping( + const MinAndMaxPreserveSeconds& preserve_info) { + // Only for opening a new DB, with preserve/preclude options set + if (!preserve_info.IsEnabled()) { + assert(false); + return; + } + if (GetLatestSequenceNumber() != 0) { + assert(false); + return; + } + + // Here we fulfill the following promise: + // + // Any DB/CF created with preserve/preclude options set from the beginning + // will get pre-allocated seqnos with pre-populated time mappings back to + // the times we are interested in. (This will enable future import of data + // while preserving rough write time. We can only do this reliably from + // DB::Open, as otherwise there could be a race between CreateColumnFamily + // and the first Write to the DB, and seqno-to-time mappings need to be + // monotonic. + // + // FIXME: We don't currently guarantee that if the first column family with + // that setting is added or configured after initial DB::Open but before + // the first user Write. Fixing this causes complications with the crash + // test because if DB starts without preserve/preclude option, does some + // user writes but all those writes are lost in crash, then re-opens with + // preserve/preclude option, it sees seqno==1 which looks like one of the + // user writes was recovered, when actually it was not. + + // Pre-allocate seqnos and pre-populate historical mapping + // We can simply modify these, before writes are allowed + constexpr uint64_t kMax = kMaxSeqnoTimePairsPerSST; + versions_->SetLastAllocatedSequence(kMax); + versions_->SetLastPublishedSequence(kMax); + versions_->SetLastSequence(kMax); + + // And record in manifest, to avoid going backwards in seqno on re-open + // (potentially with different options). Concurrency is simple because we + // are in DB::Open + const WriteOptions write_options(Env::IOActivity::kDBOpen); + const ReadOptions read_options(Env::IOActivity::kDBOpen); + VersionEdit edit; + edit.SetLastSequence(kMax); + Status s = versions_->LogAndApplyToDefaultColumnFamily( + read_options, write_options, &edit, &mutex_, directories_.GetDbDir()); + if (!s.ok() && versions_->io_status().IsIOError()) { + error_handler_.SetBGError(versions_->io_status(), + BackgroundErrorReason::kManifestWrite); + } + + auto [seqno, unix_time_now] = GetSeqnoToTimeSample(); + uint64_t populate_historical_seconds = preserve_info.max_preserve_seconds; + if (seqno > 1 && unix_time_now > populate_historical_seconds) { + // seqno=0 is reserved + SequenceNumber from_seqno = 1; + seqno_to_time_mapping_.PrePopulate( + from_seqno, seqno, unix_time_now - populate_historical_seconds, + unix_time_now); + } else { + // One of these will fail + assert(seqno > 1); + assert(unix_time_now > populate_historical_seconds); + } +} + +void DBImpl::InstallSuperVersionForConfigChange( + ColumnFamilyData* cfd, SuperVersionContext* sv_context) { + MinAndMaxPreserveSeconds preserve_info{cfd->GetLatestCFOptions()}; + std::shared_ptr new_seqno_to_time_mapping; + if (preserve_info.IsEnabled()) { + // TODO: detect & optimize if mapping hasn't changed from previous + // SuperVersion + EnsureSeqnoToTimeMapping(preserve_info); + new_seqno_to_time_mapping = std::make_shared(); + new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_); + } + InstallSuperVersionAndScheduleWork(cfd, sv_context, + std::move(new_seqno_to_time_mapping)); +} + +void DBImpl::RecordSeqnoToTimeMapping() { + SuperVersionContext sv_context; { InstrumentedMutexLock l(&mutex_); - - seqno = GetLatestSequenceNumber(); - int64_t unix_time_signed = 0; - immutable_db_options_.clock->GetCurrentTime(&unix_time_signed) - .PermitUncheckedError(); // Ignore error - unix_time = static_cast(unix_time_signed); - - if (populate_historical_seconds > 0) { - if (seqno > 1 && unix_time > populate_historical_seconds) { - // seqno=0 is reserved - SequenceNumber from_seqno = 1; - success = seqno_to_time_mapping_.PrePopulate( - from_seqno, seqno, unix_time - populate_historical_seconds, - unix_time); - InstallSeqnoToTimeMappingInSV(&sv_contexts); - } else { - // One of these will fail - assert(seqno > 1); - assert(unix_time > populate_historical_seconds); - success = false; + // Record next sample + seqno_to_time_mapping_.Append(GetSeqnoToTimeSample()); + // Create an immutable snapshot for sharing across CFs + std::shared_ptr new_seqno_to_time_mapping = + std::make_shared(); + new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_); + + // Update in SV of all applicable CFs + for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + MinAndMaxPreserveSeconds preserve_info{cfd->GetLatestCFOptions()}; + if (preserve_info.IsEnabled()) { + sv_context.NewSuperVersion(); + cfd->InstallSuperVersion(&sv_context, &mutex_, + new_seqno_to_time_mapping); } - } else { - // FIXME: assert(seqno > 0); - // Always successful assuming seqno never go backwards - seqno_to_time_mapping_.Append(seqno, unix_time); - InstallSeqnoToTimeMappingInSV(&sv_contexts); } + bg_cv_.SignalAll(); } // clean up & report outside db mutex - for (SuperVersionContext& sv_context : sv_contexts) { - sv_context.Clean(); - } + sv_context.Clean(); +} - if (populate_historical_seconds > 0) { - if (success) { - ROCKS_LOG_INFO( - immutable_db_options_.info_log, - "Pre-populated sequence number to time entries: [1,%" PRIu64 - "] -> [%" PRIu64 ",%" PRIu64 "]", - seqno, unix_time - populate_historical_seconds, unix_time); - } else { - ROCKS_LOG_WARN( - immutable_db_options_.info_log, - "Failed to pre-populate sequence number to time entries: [1,%" PRIu64 - "] -> [%" PRIu64 ",%" PRIu64 "]", - seqno, unix_time - populate_historical_seconds, unix_time); +void DBImpl::TriggerPeriodicCompaction() { + TEST_SYNC_POINT("DBImpl::TriggerPeriodicCompaction:StartRunning"); + { + InstrumentedMutexLock l(&mutex_); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Running the periodic task to trigger compactions."); + + for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + if (cfd->GetLatestCFOptions().periodic_compaction_seconds && + !cfd->queued_for_compaction()) { + cfd->current()->storage_info()->ComputeCompactionScore( + cfd->ioptions(), cfd->GetLatestMutableCFOptions(), + cfd->GetFullHistoryTsLow()); + EnqueuePendingCompaction(cfd); + if (cfd->queued_for_compaction()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Periodic task to trigger compaction queued Column " + "family [%s] for compaction.", + cfd->GetName().c_str()); + } + } } - } else { - assert(success); + MaybeScheduleFlushOrCompaction(); + bg_cv_.SignalAll(); } } @@ -6856,22 +6991,4 @@ void DBImpl::TrackOrUntrackFiles( } } -void DBImpl::InstallSeqnoToTimeMappingInSV( - std::vector* sv_contexts) { - mutex_.AssertHeld(); - std::shared_ptr new_seqno_to_time_mapping = - std::make_shared(); - new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_); - for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) { - if (cfd->IsDropped()) { - continue; - } - sv_contexts->emplace_back(/*create_superversion=*/true); - sv_contexts->back().new_seqno_to_time_mapping = new_seqno_to_time_mapping; - cfd->InstallSuperVersion(&sv_contexts->back(), - cfd->GetLatestMutableCFOptions()); - } - bg_cv_.SignalAll(); -} - } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 9c3f4dbd7cd9..c3c432bec8d8 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -173,10 +173,10 @@ struct DBOpenLogRecordReadReporter : public log::Reader::Reporter { void OldLogRecord(size_t bytes) override; - uint64_t GetCorruptedLogNumber() const { return corrupted_log_number_; } + uint64_t GetCorruptedLogNumber() const { return corrupted_wal_number_; } private: - uint64_t corrupted_log_number_ = kMaxSequenceNumber; + uint64_t corrupted_wal_number_ = kMaxSequenceNumber; }; // While DB is the public interface of RocksDB, and DBImpl is the actual @@ -256,6 +256,10 @@ class DBImpl : public DB { Status WriteWithCallback(const WriteOptions& options, WriteBatch* updates, UserWriteCallback* user_write_cb) override; + Status IngestWriteBatchWithIndex( + const WriteOptions& options, + std::shared_ptr wbwi) override; + using DB::Get; Status Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, @@ -379,6 +383,11 @@ class DBImpl : public DB { const std::vector& column_families, std::vector* iterators) override; + using DB::NewMultiScan; + std::unique_ptr NewMultiScan( + const ReadOptions& _read_options, ColumnFamilyHandle* column_family, + const MultiScanArgs& scan_opts) override; + const Snapshot* GetSnapshot() override; void ReleaseSnapshot(const Snapshot* snapshot) override; @@ -446,19 +455,20 @@ class DBImpl : public DB { void EnableManualCompaction() override; void DisableManualCompaction() override; + void AbortAllCompactions() override; + void ResumeAllCompactions() override; using DB::SetOptions; Status SetOptions( - ColumnFamilyHandle* column_family, - const std::unordered_map& options_map) override; + const std::unordered_map>& + column_families_opts_map) override; Status SetDBOptions( const std::unordered_map& options_map) override; using DB::NumberLevels; int NumberLevels(ColumnFamilyHandle* column_family) override; - using DB::MaxMemCompactionLevel; - int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override; using DB::Level0StopWriteTrigger; int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) override; const std::string& GetName() const override; @@ -475,10 +485,13 @@ class DBImpl : public DB { const FlushOptions& options, const std::vector& column_families) override; Status FlushWAL(bool sync) override { - // TODO: plumb Env::IOActivity, Env::IOPriority - return FlushWAL(WriteOptions(), sync); + FlushWALOptions options; + options.sync = sync; + return FlushWAL(options); } + Status FlushWAL(const FlushWALOptions& options) override; + virtual Status FlushWAL(const WriteOptions& write_options, bool sync); bool WALBufferIsEmpty(); Status SyncWAL() override; @@ -497,6 +510,9 @@ class DBImpl : public DB { Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family, std::string* ts_low) override; + Status GetNewestUserDefinedTimestamp(ColumnFamilyHandle* column_family, + std::string* newest_timestamp) override; + Status GetDbIdentity(std::string& identity) const override; virtual Status GetDbIdentityFromIdentityFile(const IOOptions& opts, @@ -530,11 +546,11 @@ class DBImpl : public DB { // Get the known flushed sizes of WALs that might still be written to // or have pending sync. - // NOTE: unlike alive_log_files_, this function includes WALs that might + // NOTE: unlike alive_wal_files_, this function includes WALs that might // be obsolete (but not obsolete to a pending Checkpoint) and not yet fully // synced. Status GetOpenWalSizes(std::map& number_to_size); - Status GetCurrentWalFile(std::unique_ptr* current_log_file) override; + Status GetCurrentWalFile(std::unique_ptr* current_wal_file) override; Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override; Status GetUpdatesSince( @@ -542,7 +558,7 @@ class DBImpl : public DB { const TransactionLogIterator::ReadOptions& read_options = TransactionLogIterator::ReadOptions()) override; Status DeleteFilesInRanges(ColumnFamilyHandle* column_family, - const RangePtr* ranges, size_t n, + const RangeOpt* ranges, size_t n, bool include_end = true); void GetLiveFilesMetaData(std::vector* metadata) override; @@ -558,6 +574,11 @@ class DBImpl : public DB { void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, ColumnFamilyMetaData* metadata) override; + // Get column family metadata with filtering based on key range and level + void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + const GetColumnFamilyMetaDataOptions& options, + ColumnFamilyMetaData* metadata) override; + void GetAllColumnFamilyMetaData( std::vector* metadata) override; @@ -651,6 +672,11 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, const Range* range, std::size_t n, TablePropertiesCollection* props) override; + Status GetPropertiesOfTablesByLevel( + ColumnFamilyHandle* column_family, + std::vector>* props_by_level) + override; + // ---- End of implementations of the DB interface ---- SystemClock* GetSystemClock() const; @@ -787,10 +813,6 @@ class DBImpl : public DB { // being detected. const Snapshot* GetSnapshotForWriteConflictBoundary(); - // checks if all live files exist on file system and that their file sizes - // match to our in-memory records - virtual Status CheckConsistency(); - // max_file_num_to_ignore allows bottom level compaction to filter out newly // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will // disable the filtering @@ -1063,7 +1085,7 @@ class DBImpl : public DB { void AddToLogsToFreeQueue(log::Writer* log_writer) { mutex_.AssertHeld(); - logs_to_free_queue_.push_back(log_writer); + wals_to_free_queue_.push_back(log_writer); } void AddSuperVersionsToFreeQueue(SuperVersion* sv) { @@ -1073,10 +1095,7 @@ class DBImpl : public DB { void SetSnapshotChecker(SnapshotChecker* snapshot_checker); // Fill JobContext with snapshot information needed by flush and compaction. - void GetSnapshotContext(JobContext* job_context, - std::vector* snapshot_seqs, - SequenceNumber* earliest_write_conflict_snapshot, - SnapshotChecker** snapshot_checker); + void InitSnapshotContext(JobContext* job_context); // Not thread-safe. void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback); @@ -1128,7 +1147,7 @@ class DBImpl : public DB { bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } bool TEST_IsLogGettingFlushed() { - return alive_log_files_.begin()->getting_flushed; + return alive_wal_files_.begin()->getting_flushed; } Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr); @@ -1208,7 +1227,9 @@ class DBImpl : public DB { uint64_t TEST_LogfileNumber(); - uint64_t TEST_total_log_size() const { return total_log_size_; } + uint64_t TEST_wals_total_size() const { + return wals_total_size_.LoadRelaxed(); + } void TEST_GetAllBlockCaches(std::unordered_set* cache_set); @@ -1267,27 +1288,24 @@ class DBImpl : public DB { // flush LOG out of application buffer void FlushInfoLog(); - // record current sequence number to time mapping. If - // populate_historical_seconds > 0 then pre-populate all the - // sequence numbers from [1, last] to map to [now minus - // populate_historical_seconds, now]. - void RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds); - - // Everytime DB's seqno to time mapping changed (which already hold the db - // mutex), we install a new SuperVersion in each column family with a shared - // copy of the new mapping while holding the db mutex. - // This is done for all column families even though the column family does not - // explicitly enabled the - // `preclude_last_level_data_seconds` or `preserve_internal_time_seconds` - // features. - // This mapping supports iterators to fulfill the - // "rocksdb.iterator.write-time" iterator property for entries in memtables. - // - // Since this new SuperVersion doesn't involve an LSM tree shape change, we - // don't schedule work after installing this SuperVersion. It returns the used - // `SuperVersionContext` for clean up after release mutex. - void InstallSeqnoToTimeMappingInSV( - std::vector* sv_contexts); + // For the background timer job + void RecordSeqnoToTimeMapping(); + + // Compactions rely on an event triggers like flush/compaction/SetOptions. + // We need to trigger periodic compactions even when there is no such trigger. + // This function checks and schedules available compactions and will run + // periodically. + void TriggerPeriodicCompaction(); + + // REQUIRES: DB mutex held + std::pair GetSeqnoToTimeSample() const; + + // REQUIRES: DB mutex held or during open + void EnsureSeqnoToTimeMapping(const MinAndMaxPreserveSeconds& preserve_secs); + + // Only called during open + void PrepopulateSeqnoToTimeMapping( + const MinAndMaxPreserveSeconds& preserve_secs); // Interface to block and signal the DB in case of stalling writes by // WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface. @@ -1375,16 +1393,19 @@ class DBImpl : public DB { // State below is protected by mutex_ // With two_write_queues enabled, some of the variables that accessed during - // WriteToWAL need different synchronization: log_empty_, alive_log_files_, - // logs_, logfile_number_. Refer to the definition of each variable below for + // WriteToWAL need different synchronization: wal_empty_, alive_wal_files_, + // logs_, cur_wal_number_. Refer to the definition of each variable below for // more description. // + // Protects access to most ColumnFamilyData methods, see more in comment for + // each method. + // // `mutex_` can be a hot lock in some workloads, so it deserves dedicated // cachelines. mutable CacheAlignedInstrumentedMutex mutex_; - ColumnFamilyHandleImpl* default_cf_handle_; - InternalStats* default_cf_internal_stats_; + ColumnFamilyHandleImpl* default_cf_handle_ = nullptr; + InternalStats* default_cf_internal_stats_ = nullptr; // table_cache_ provides its own synchronization std::shared_ptr table_cache_; @@ -1396,7 +1417,7 @@ class DBImpl : public DB { // only used for dynamically adjusting max_total_wal_size. it is a sum of // [write_buffer_size * max_write_buffer_number] over all column families - std::atomic max_total_in_memory_state_; + std::atomic max_total_in_memory_state_ = 0; // The options to access storage files const FileOptions file_options_; @@ -1423,14 +1444,14 @@ class DBImpl : public DB { // Each flush or compaction gets its own job id. this counter makes sure // they're unique - std::atomic next_job_id_; + std::atomic next_job_id_ = 1; - std::atomic shutting_down_; + std::atomic shutting_down_ = false; // No new background jobs can be queued if true. This is used to prevent new // background jobs from being queued after WaitForCompact() completes waiting // all background jobs then attempts to close when close_db_ option is true. - bool reject_new_background_jobs_; + bool reject_new_background_jobs_ = false; // RecoveryContext struct stores the context about version edits along // with corresponding column_family_data and column_family_options. @@ -1528,11 +1549,11 @@ class DBImpl : public DB { // ingests `wbwi` is done. // @param memtable_updated Whether the same write that ingests wbwi has // updated memtable. This is useful for determining whether to set bg - // error when IngestWBWI fails. - Status IngestWBWI(std::shared_ptr wbwi, - const WBWIMemTable::SeqnoRange& assigned_seqno, - uint64_t min_prep_log, SequenceNumber last_seqno, - bool memtable_updated, bool ignore_missing_cf); + // error when IngestWBWIAsMemtable fails. + Status IngestWBWIAsMemtable(std::shared_ptr wbwi, + const WBWIMemTable::SeqnoRange& assigned_seqno, + uint64_t min_prep_log, SequenceNumber last_seqno, + bool memtable_updated, bool ignore_missing_cf); // If disable_memtable is set the application logic must guarantee that the // batch will still be skipped from memtable during the recovery. An excption @@ -1562,18 +1583,17 @@ class DBImpl : public DB { Status WriteImpl(const WriteOptions& options, WriteBatch* updates, WriteCallback* callback = nullptr, UserWriteCallback* user_write_cb = nullptr, - uint64_t* log_used = nullptr, uint64_t log_ref = 0, + uint64_t* wal_used = nullptr, uint64_t log_ref = 0, bool disable_memtable = false, uint64_t* seq_used = nullptr, size_t batch_cnt = 0, PreReleaseCallback* pre_release_callback = nullptr, PostMemTableCallback* post_memtable_callback = nullptr, - std::shared_ptr wbwi = nullptr, - uint64_t min_prep_log = 0); + std::shared_ptr wbwi = nullptr); Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates, WriteCallback* callback = nullptr, UserWriteCallback* user_write_cb = nullptr, - uint64_t* log_used = nullptr, uint64_t log_ref = 0, + uint64_t* wal_used = nullptr, uint64_t log_ref = 0, bool disable_memtable = false, uint64_t* seq_used = nullptr); @@ -1600,7 +1620,7 @@ class DBImpl : public DB { Status WriteImplWALOnly( WriteThread* write_thread, const WriteOptions& options, WriteBatch* updates, WriteCallback* callback, - UserWriteCallback* user_write_cb, uint64_t* log_used, + UserWriteCallback* user_write_cb, uint64_t* wal_used, const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, const PublishLastSeq publish_last_seq, const bool disable_memtable); @@ -1761,9 +1781,9 @@ class DBImpl : public DB { } }; - struct LogFileNumberSize { - explicit LogFileNumberSize(uint64_t _number) : number(_number) {} - LogFileNumberSize() {} + struct WalFileNumberSize { + explicit WalFileNumberSize(uint64_t _number) : number(_number) {} + WalFileNumberSize() {} void AddSize(uint64_t new_size) { size += new_size; } uint64_t number; uint64_t size = 0; @@ -1785,6 +1805,13 @@ class DBImpl : public DB { if (writer->file()) { // TODO: plumb Env::IOActivity, Env::IOPriority s = writer->WriteBuffer(WriteOptions()); + if (attempt_truncate_size < SIZE_MAX && + attempt_truncate_size < writer->file()->GetFileSize()) { + Status s2 = writer->file()->writable_file()->Truncate( + attempt_truncate_size, IOOptions{}, nullptr); + // This is just a best effort attempt + s2.PermitUncheckedError(); + } } delete writer; writer = nullptr; @@ -1817,6 +1844,11 @@ class DBImpl : public DB { getting_synced = false; } + void SetAttemptTruncateSize(uint64_t size) { + assert(attempt_truncate_size == SIZE_MAX); + attempt_truncate_size = size; + } + uint64_t number; // Visual Studio doesn't support deque's member to be noncopyable because // of a std::unique_ptr as a member. @@ -1829,15 +1861,20 @@ class DBImpl : public DB { // to be persisted even if appends happen during sync so it can be used for // tracking the synced size in MANIFEST. uint64_t pre_sync_size = 0; + // When < SIZE_MAX, attempt to truncate the WAL to this size on close, + // because a bad entry was written to it beyond that point and it likely + // won't be recoverable with the bad entry. + uint64_t attempt_truncate_size = SIZE_MAX; }; - struct LogContext { - explicit LogContext(bool need_sync = false) - : need_log_sync(need_sync), need_log_dir_sync(need_sync) {} - bool need_log_sync = false; - bool need_log_dir_sync = false; + struct WalContext { + explicit WalContext(bool need_sync = false) + : need_wal_sync(need_sync), need_wal_dir_sync(need_sync) {} + bool need_wal_sync = false; + bool need_wal_dir_sync = false; log::Writer* writer = nullptr; - LogFileNumberSize* log_file_number_size = nullptr; + WalFileNumberSize* wal_file_number_size = nullptr; + uint64_t prev_size = SIZE_MAX; }; // PurgeFileInfo is a structure to hold information of files to be deleted in @@ -1929,12 +1966,19 @@ class DBImpl : public DB { }; struct PrepickedCompaction { // background compaction takes ownership of `compaction`. + // TODO(hx235): consider using std::shared_ptr for easier ownership + // management Compaction* compaction; // caller retains ownership of `manual_compaction_state` as it is reused // across background compactions. ManualCompactionState* manual_compaction_state; // nullptr if non-manual // task limiter token is requested during compaction picking. std::unique_ptr task_token; + // If true, `compaction` is picked temporarily to express compaction intent + // and will be released before re-picking a real compaction based on the + // updated LSM shape when thread associated with `compaction` is ready to + // run + bool need_repick; }; struct CompactionArg { @@ -1979,7 +2023,7 @@ class DBImpl : public DB { // Follow-up work to user creating a column family or (families) Status WrapUpCreateColumnFamilies( - const ReadOptions& read_options, const WriteOptions& write_options, + const WriteOptions& write_options, const std::vector& cf_options); Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family); @@ -2025,14 +2069,13 @@ class DBImpl : public DB { // Flush the in-memory write buffer to storage. Switches to a new // log-file/memtable and writes a new descriptor iff successful. Then // installs a new super version for the column family. - Status FlushMemTableToOutputFile( - ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - bool* madeProgress, JobContext* job_context, FlushReason flush_reason, - SuperVersionContext* superversion_context, - std::vector& snapshot_seqs, - SequenceNumber earliest_write_conflict_snapshot, - SnapshotChecker* snapshot_checker, LogBuffer* log_buffer, - Env::Priority thread_pri); + Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, + const MutableCFOptions& mutable_cf_options, + bool* madeProgress, JobContext* job_context, + FlushReason flush_reason, + SuperVersionContext* superversion_context, + LogBuffer* log_buffer, + Env::Priority thread_pri); // Flush the memtables of (multiple) column families to multiple files on // persistent storage. @@ -2045,10 +2088,10 @@ class DBImpl : public DB { JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri); // REQUIRES: log_numbers are sorted in ascending order - // corrupted_log_found is set to true if we recover from a corrupted log file. + // corrupted_wal_found is set to true if we recover from a corrupted log file. Status RecoverLogFiles(const std::vector& log_numbers, SequenceNumber* next_sequence, bool read_only, - bool is_retry, bool* corrupted_log_found, + bool is_retry, bool* corrupted_wal_found, RecoveryContext* recovery_ctx); void SetupLogFilesRecovery( @@ -2138,6 +2181,11 @@ class DBImpl : public DB { bool flushed, std::unordered_map* version_edits, RecoveryContext* recovery_ctx); + // Check that DB sequence number is not set back during recovery between + // replaying of WAL files and between replaying of WriteBatches. + Status CheckSeqnoNotSetBackDuringRecovery(SequenceNumber prev_next_seqno, + SequenceNumber current_next_seqno); + void FinishLogFilesRecovery(int job_id, const Status& status); // The following two methods are used to flush a memtable to // storage. The first one is used at database RecoveryTime (when the @@ -2151,12 +2199,12 @@ class DBImpl : public DB { // log file to its actual size, thereby freeing preallocated space. // Return success even if truncate fails Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate, - LogFileNumberSize* log); + WalFileNumberSize* log); - // Restore alive_log_files_ and total_log_size_ after recovery. + // Restore alive_wal_files_ and wals_total_size_ after recovery. // It needs to run only when there's no flush during recovery // (e.g. avoid_flush_during_recovery=true). May also trigger flush - // in case total_log_size > max_total_wal_size. + // in case wals_total_size > max_total_wal_size. Status RestoreAliveLogFiles(const std::vector& log_numbers); // num_bytes: for slowdown case, delay time is calculated based on @@ -2305,7 +2353,7 @@ class DBImpl : public DB { // REQUIRES: mutex locked Status PreprocessWrite(const WriteOptions& write_options, - LogContext* log_context, WriteContext* write_context); + WalContext* log_context, WriteContext* write_context); // Merge write batches in the write group into merged_batch. // Returns OK if merge is successful. @@ -2316,20 +2364,21 @@ class DBImpl : public DB { IOStatus WriteToWAL(const WriteBatch& merged_batch, const WriteOptions& write_options, - log::Writer* log_writer, uint64_t* log_used, + log::Writer* log_writer, uint64_t* wal_used, uint64_t* log_size, - LogFileNumberSize& log_file_number_size, + WalFileNumberSize& wal_file_number_size, SequenceNumber sequence); - IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group, - log::Writer* log_writer, uint64_t* log_used, - bool need_log_sync, bool need_log_dir_sync, - SequenceNumber sequence, - LogFileNumberSize& log_file_number_size); + IOStatus WriteGroupToWAL(const WriteThread::WriteGroup& write_group, + log::Writer* log_writer, uint64_t* wal_used, + bool need_wal_sync, bool need_wal_dir_sync, + SequenceNumber sequence, + WalFileNumberSize& wal_file_number_size); - IOStatus ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, - uint64_t* log_used, - SequenceNumber* last_sequence, size_t seq_inc); + IOStatus ConcurrentWriteGroupToWAL(const WriteThread::WriteGroup& write_group, + uint64_t* wal_used, + SequenceNumber* last_sequence, + size_t seq_inc); // Used by WriteImpl to update bg_error_ if paranoid check is enabled. // Caller must hold mutex_. @@ -2343,7 +2392,7 @@ class DBImpl : public DB { void WALIOStatusCheck(const IOStatus& status); // Used by WriteImpl to update bg_error_ in case of memtable insert error. - void MemTableInsertStatusCheck(const Status& memtable_insert_status); + void HandleMemTableInsertFailure(const Status& nonok_memtable_insert_status); Status CompactFilesImpl(const CompactionOptions& compact_options, ColumnFamilyData* cfd, Version* version, @@ -2353,6 +2402,14 @@ class DBImpl : public DB { JobContext* job_context, LogBuffer* log_buffer, CompactionJobInfo* compaction_job_info); + // Helper function to perform trivial move by updating manifest metadata + // without rewriting data files. This is called when IsTrivialMove() is true. + // REQUIRES: mutex held + // Returns: Status of the trivial move operation + Status PerformTrivialMove(Compaction& c, LogBuffer* log_buffer, + bool& compaction_released, size_t& moved_files, + size_t& moved_bytes); + // REQUIRES: mutex unlocked void TrackOrUntrackFiles(const std::vector& existing_data_files, bool track); @@ -2428,6 +2485,8 @@ class DBImpl : public DB { bool* flush_rescheduled_to_retain_udt, Env::Priority thread_pri); + Compaction* CreateIntendedCompactionForwardedToBottomPriorityPool( + Compaction* c); bool EnoughRoomForCompaction(ColumnFamilyData* cfd, const std::vector& inputs, bool* sfm_bookkeeping, LogBuffer* log_buffer); @@ -2450,9 +2509,7 @@ class DBImpl : public DB { // Cancel scheduled periodic tasks Status CancelPeriodicTaskScheduler(); - Status RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options, - const WriteOptions& write_options, - bool is_new_db); + Status RegisterRecordSeqnoTimeWorker(); void PrintStatistics(); @@ -2518,12 +2575,21 @@ class DBImpl : public DB { // Background threads call this function, which is just a wrapper around // the InstallSuperVersion() function. Background threads carry - // sv_context which can have new_superversion already - // allocated. + // sv_context to allow allocation of SuperVersion object outside of holding + // the DB mutex. // All ColumnFamily state changes go through this function. Here we analyze // the new state and we schedule background work if we detect that the new // state needs flush or compaction. - void InstallSuperVersionAndScheduleWork(ColumnFamilyData* cfd, + // See also InstallSuperVersionForConfigChange(). + void InstallSuperVersionAndScheduleWork( + ColumnFamilyData* cfd, SuperVersionContext* sv_context, + std::optional> + new_seqno_to_time_mapping = {}); + + // A variant of InstallSuperVersionAndScheduleWork() that must be used for + // new CFs or for changes to mutable_cf_options. This is so that it can + // update seqno_to_time_mapping cached for the new SuperVersion as relevant. + void InstallSuperVersionForConfigChange(ColumnFamilyData* cfd, SuperVersionContext* sv_context); bool GetIntPropertyInternal(ColumnFamilyData* cfd, @@ -2538,7 +2604,7 @@ class DBImpl : public DB { bool ShouldntRunManualCompaction(ManualCompactionState* m); bool HaveManualCompaction(ColumnFamilyData* cfd); bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); - void UpdateDeletionCompactionStats(const std::unique_ptr& c); + void UpdateFIFOCompactionStatus(const std::unique_ptr& c); // May open and read table files for table property. // Should not be called while holding mutex_. @@ -2688,8 +2754,13 @@ class DBImpl : public DB { const std::vector& column_families, ErrorIteratorFuncType error_iterator_func); + bool ShouldPickCompaction(bool is_prepicked, + const PrepickedCompaction* prepicked_compaction); + + void ResetBottomPriCompactionIntent(ColumnFamilyData* cfd, + std::unique_ptr& c); // Lock over the persistent DB state. Non-nullptr iff successfully acquired. - FileLock* db_lock_; + FileLock* db_lock_ = nullptr; // Guards changes to DB and CF options to ensure consistency between // * In-memory options objects @@ -2703,20 +2774,28 @@ class DBImpl : public DB { // Guards reads and writes to in-memory stats_history_. InstrumentedMutex stats_history_mutex_; - // In addition to mutex_, log_write_mutex_ protects writes to logs_ and - // logfile_number_. With two_write_queues it also protects alive_log_files_, - // and log_empty_. Refer to the definition of each variable below for more + // In addition to mutex_, wal_write_mutex_ protects writes to logs_ and + // cur_wal_number_. With two_write_queues it also protects alive_wal_files_, + // and wal_empty_. Refer to the definition of each variable below for more // details. - // Note: to avoid deadlock, if needed to acquire both log_write_mutex_ and - // mutex_, the order should be first mutex_ and then log_write_mutex_. - InstrumentedMutex log_write_mutex_; + // Note: to avoid deadlock, if needed to acquire both wal_write_mutex_ and + // mutex_, the order should be first mutex_ and then wal_write_mutex_. + InstrumentedMutex wal_write_mutex_; // If zero, manual compactions are allowed to proceed. If non-zero, manual // compactions may still be running, but will quickly fail with // `Status::Incomplete`. The value indicates how many threads have paused // manual compactions. It is accessed in read mode outside the DB mutex in // compaction code paths. - std::atomic manual_compaction_paused_; + std::atomic manual_compaction_paused_ = false; + + // If non-zero, all compaction jobs (background automatic compactions, + // manual compactions via CompactRange, and foreground CompactFiles calls) + // are being aborted. Compactions will be signaled to stop. Any new + // compaction job would fail immediately. The value indicates how many threads + // have called AbortAllCompactions(). It is accessed in read mode outside the + // DB mutex in compaction code paths. + std::atomic compaction_aborted_ = 0; // This condition variable is signaled on these conditions: // * whenever bg_compaction_scheduled_ goes down to 0 @@ -2732,106 +2811,114 @@ class DBImpl : public DB { // * whenever SetOptions successfully updates options. // * whenever a column family is dropped. InstrumentedCondVar bg_cv_; - // Writes are protected by locking both mutex_ and log_write_mutex_, and reads - // must be under either mutex_ or log_write_mutex_. Since after ::Open, - // logfile_number_ is currently updated only in write_thread_, it can be read + + ColumnFamilyHandleImpl* persist_stats_cf_handle_ = nullptr; + + bool persistent_stats_cfd_exists_ = true; + + // Writes are protected by locking both mutex_ and wal_write_mutex_, and reads + // must be under either mutex_ or wal_write_mutex_. Since after ::Open, + // cur_wal_number_ is currently updated only in write_thread_, it can be read // from the same write_thread_ without any locks. - uint64_t logfile_number_; + uint64_t cur_wal_number_ = 0; + // Log files that we can recycle. Must be protected by db mutex_. - std::deque log_recycle_files_; + std::deque wal_recycle_files_; + // The minimum log file number taht can be recycled, if log recycling is // enabled. This is used to ensure that log files created by previous // instances of the database are not recycled, as we cannot be sure they // were created in the recyclable format. - uint64_t min_log_number_to_recycle_; - // Protected by log_write_mutex_. - bool log_dir_synced_; - // Without two_write_queues, read and writes to log_empty_ are protected by + uint64_t min_wal_number_to_recycle_ = 0; + + // Protected by wal_write_mutex_. + bool wal_dir_synced_ = false; + + // Without two_write_queues, read and writes to wal_empty_ are protected by // mutex_. Since it is currently updated/read only in write_thread_, it can be // accessed from the same write_thread_ without any locks. With // two_write_queues writes, where it can be updated in different threads, - // read and writes are protected by log_write_mutex_ instead. This is to avoid - // expensive mutex_ lock during WAL write, which update log_empty_. - bool log_empty_; - - ColumnFamilyHandleImpl* persist_stats_cf_handle_; - - bool persistent_stats_cfd_exists_ = true; + // read and writes are protected by wal_write_mutex_ instead. This is to avoid + // expensive mutex_ lock during WAL write, which update wal_empty_. + bool wal_empty_ = true; // The current WAL file and those that have not been found obsolete from // memtable flushes. A WAL not on this list might still be pending writer - // flush and/or sync and close and might still be in logs_. alive_log_files_ - // is protected by mutex_ and log_write_mutex_ with details as follows: + // flush and/or sync and close and might still be in logs_. alive_wal_files_ + // is protected by mutex_ and wal_write_mutex_ with details as follows: // 1. read by FindObsoleteFiles() which can be called in either application - // thread or RocksDB bg threads, both mutex_ and log_write_mutex_ are + // thread or RocksDB bg threads, both mutex_ and wal_write_mutex_ are // held. - // 2. pop_front() by FindObsoleteFiles(), both mutex_ and log_write_mutex_ + // 2. pop_front() by FindObsoleteFiles(), both mutex_ and wal_write_mutex_ // are held. // 3. push_back() by DBImpl::Open() and DBImpl::RestoreAliveLogFiles() // (actually called by Open()), only mutex_ is held because at this point, // the DB::Open() call has not returned success to application, and the // only other thread(s) that can conflict are bg threads calling - // FindObsoleteFiles() which ensure that both mutex_ and log_write_mutex_ - // are held when accessing alive_log_files_. + // FindObsoleteFiles() which ensure that both mutex_ and wal_write_mutex_ + // are held when accessing alive_wal_files_. // 4. read by DBImpl::Open() is protected by mutex_. - // 5. push_back() by SwitchMemtable(). Both mutex_ and log_write_mutex_ are + // 5. push_back() by SwitchMemtable(). Both mutex_ and wal_write_mutex_ are // held. This is done by the write group leader. Note that in the case of // two-write-queues, another WAL-only write thread can be writing to the // WAL concurrently. See 9. - // 6. read by SwitchWAL() with both mutex_ and log_write_mutex_ held. This is + // 6. read by SwitchWAL() with both mutex_ and wal_write_mutex_ held. This is // done by write group leader. // 7. read by ConcurrentWriteToWAL() by the write group leader in the case of - // two-write-queues. Only log_write_mutex_ is held to protect concurrent + // two-write-queues. Only wal_write_mutex_ is held to protect concurrent // pop_front() by FindObsoleteFiles(). - // 8. read by PreprocessWrite() by the write group leader. log_write_mutex_ + // 8. read by PreprocessWrite() by the write group leader. wal_write_mutex_ // is held to protect the data structure from concurrent pop_front() by // FindObsoleteFiles(). // 9. read by ConcurrentWriteToWAL() by a WAL-only write thread in the case - // of two-write-queues. Only log_write_mutex_ is held. This suffices to + // of two-write-queues. Only wal_write_mutex_ is held. This suffices to // protect the data structure from concurrent push_back() by current // write group leader as well as pop_front() by FindObsoleteFiles(). - std::deque alive_log_files_; + std::deque alive_wal_files_; + + // Total size of all "alive" WALs (for easy access without synchronization) + RelaxedAtomic wals_total_size_{0}; // Log files that aren't fully synced, and the current log file. // Synchronization: // 1. read by FindObsoleteFiles() which can be called either in application - // thread or RocksDB bg threads. log_write_mutex_ is always held, while + // thread or RocksDB bg threads. wal_write_mutex_ is always held, while // some reads are performed without mutex_. - // 2. pop_front() by FindObsoleteFiles() with only log_write_mutex_ held. - // 3. read by DBImpl::Open() with both mutex_ and log_write_mutex_. - // 4. emplace_back() by DBImpl::Open() with both mutex_ and log_write_mutex. + // 2. pop_front() by FindObsoleteFiles() with only wal_write_mutex_ held. + // 3. read by DBImpl::Open() with both mutex_ and wal_write_mutex_. + // 4. emplace_back() by DBImpl::Open() with both mutex_ and wal_write_mutex. // Note that at this point, DB::Open() has not returned success to // application, thus the only other thread(s) that can conflict are bg // threads calling FindObsoleteFiles(). See 1. - // 5. iteration and clear() from CloseHelper() always hold log_write_mutex + // 5. iteration and clear() from CloseHelper() always hold wal_write_mutex // and mutex_. // 6. back() called by APIs FlushWAL() and LockWAL() are protected by only - // log_write_mutex_. These two can be called by application threads after + // wal_write_mutex_. These two can be called by application threads after // DB::Open() returns success to applications. - // 7. read by SyncWAL(), another API, protected by only log_write_mutex_. + // 7. read by SyncWAL(), another API, protected by only wal_write_mutex_. // 8. read by MarkLogsNotSynced() and MarkLogsSynced() are protected by - // log_write_mutex_. - // 9. erase() by MarkLogsSynced() protected by log_write_mutex_. - // 10. read by SyncClosedWals() protected by only log_write_mutex_. This can + // wal_write_mutex_. + // 9. erase() by MarkLogsSynced() protected by wal_write_mutex_. + // 10. read by SyncClosedWals() protected by only wal_write_mutex_. This can // happen in bg flush threads after DB::Open() returns success to // applications. // 11. reads, e.g. front(), iteration, and back() called by PreprocessWrite() - // holds only the log_write_mutex_. This is done by the write group + // holds only the wal_write_mutex_. This is done by the write group // leader. A bg thread calling FindObsoleteFiles() or MarkLogsSynced() - // can happen concurrently. This is fine because log_write_mutex_ is used + // can happen concurrently. This is fine because wal_write_mutex_ is used // by all parties. See 2, 5, 9. // 12. reads, empty(), back() called by SwitchMemtable() hold both mutex_ and - // log_write_mutex_. This happens in the write group leader. + // wal_write_mutex_. This happens in the write group leader. // 13. emplace_back() by SwitchMemtable() hold both mutex_ and - // log_write_mutex_. This happens in the write group leader. Can conflict + // wal_write_mutex_. This happens in the write group leader. Can conflict // with bg threads calling FindObsoleteFiles(), MarkLogsSynced(), // SyncClosedWals(), etc. as well as application threads calling // FlushWAL(), SyncWAL(), LockWAL(). This is fine because all parties - // require at least log_write_mutex_. + // require at least wal_write_mutex_. // 14. iteration called in WriteToWAL(write_group) protected by - // log_write_mutex_. This is done by write group leader when + // wal_write_mutex_. This is done by write group leader when // two-write-queues is disabled and write needs to sync logs. - // 15. back() called in ConcurrentWriteToWAL() protected by log_write_mutex_. + // 15. back() called in ConcurrentWriteToWAL() protected by wal_write_mutex_. // This can be done by the write group leader if two-write-queues is // enabled. It can also be done by another WAL-only write thread. // @@ -2848,23 +2935,22 @@ class DBImpl : public DB { std::deque logs_; // Signaled when getting_synced becomes false for some of the logs_. - InstrumentedCondVar log_sync_cv_; + InstrumentedCondVar wal_sync_cv_; // This is the app-level state that is written to the WAL but will be used // only during recovery. Using this feature enables not writing the state to // memtable on normal writes and hence improving the throughput. Each new // write of the state will replace the previous state entirely even if the // keys in the two consecutive states do not overlap. - // It is protected by log_write_mutex_ when two_write_queues_ is enabled. + // It is protected by wal_write_mutex_ when two_write_queues_ is enabled. // Otherwise only the heaad of write_thread_ can access it. WriteBatch cached_recoverable_state_; std::atomic cached_recoverable_state_empty_ = {true}; - std::atomic total_log_size_; // If this is non-empty, we need to delete these log files in background - // threads. Protected by log_write_mutex_. - autovector logs_to_free_; + // threads. Protected by wal_write_mutex_. + autovector wals_to_free_; - bool is_snapshot_supported_; + bool is_snapshot_supported_ = true; std::map> stats_history_; @@ -2888,7 +2974,7 @@ class DBImpl : public DB { // sleep if it uses up the quota. // Note: This is to protect memtable and compaction. If the batch only writes // to the WAL its size need not to be included in this. - uint64_t last_batch_group_size_; + uint64_t last_batch_group_size_ = 0; FlushScheduler flush_scheduler_; @@ -2947,32 +3033,32 @@ class DBImpl : public DB { std::unordered_set files_grabbed_for_purge_; // A queue to store log writers to close. Protected by db mutex_. - std::deque logs_to_free_queue_; + std::deque wals_to_free_queue_; std::deque superversions_to_free_queue_; - int unscheduled_flushes_; + int unscheduled_flushes_ = 0; - int unscheduled_compactions_; + int unscheduled_compactions_ = 0; // count how many background compactions are running or have been scheduled in // the BOTTOM pool - int bg_bottom_compaction_scheduled_; + int bg_bottom_compaction_scheduled_ = 0; // count how many background compactions are running or have been scheduled - int bg_compaction_scheduled_; + int bg_compaction_scheduled_ = 0; // stores the number of compactions are currently running - int num_running_compactions_; + int num_running_compactions_ = 0; // number of background memtable flush jobs, submitted to the HIGH pool - int bg_flush_scheduled_; + int bg_flush_scheduled_ = 0; // stores the number of flushes are currently running - int num_running_flushes_; + int num_running_flushes_ = 0; // number of background obsolete file purge jobs, submitted to the HIGH pool - int bg_purge_scheduled_; + int bg_purge_scheduled_ = 0; std::deque manual_compaction_dequeue_; @@ -2982,11 +3068,11 @@ class DBImpl : public DB { // This enables two different threads to call // EnableFileDeletions() and DisableFileDeletions() // without any synchronization - int disable_delete_obsolete_files_; + int disable_delete_obsolete_files_ = 0; // Number of times FindObsoleteFiles has found deletable files and the // corresponding call to PurgeObsoleteFiles has not yet finished. - int pending_purge_obsolete_files_; + int pending_purge_obsolete_files_ = 0; // last time when DeleteObsoleteFiles with full scan was executed. Originally // initialized with startup time. @@ -2998,12 +3084,12 @@ class DBImpl : public DB { // The mutex used by switch_cv_. mutex_ should be acquired beforehand. std::mutex switch_mutex_; // Number of threads intending to write to memtable - std::atomic pending_memtable_writes_ = {}; + std::atomic pending_memtable_writes_{0}; // A flag indicating whether the current rocksdb database has any // data that is not yet persisted into either WAL or SST file. // Used when disableWAL is true. - std::atomic has_unpersisted_data_; + std::atomic has_unpersisted_data_{false}; // if an attempt was made to flush all column families that // the oldest log depends on but uncommitted data in the oldest @@ -3011,26 +3097,26 @@ class DBImpl : public DB { // We must attempt to free the dependent memtables again // at a later time after the transaction in the oldest // log is fully commited. - bool unable_to_release_oldest_log_; + bool unable_to_release_oldest_log_{false}; // Number of running IngestExternalFile() or CreateColumnFamilyWithImport() // calls. // REQUIRES: mutex held - int num_running_ingest_file_; + int num_running_ingest_file_ = 0; WalManager wal_manager_; // A value of > 0 temporarily disables scheduling of background work - int bg_work_paused_; + int bg_work_paused_ = 0; // A value of > 0 temporarily disables scheduling of background compaction - int bg_compaction_paused_; + int bg_compaction_paused_ = 0; // Guard against multiple concurrent refitting - bool refitting_level_; + bool refitting_level_ = false; // Indicate DB was opened successfully - bool opened_successfully_; + bool opened_successfully_ = false; // The min threshold to triggere bottommost compaction for removing // garbages, among all column families. @@ -3076,13 +3162,13 @@ class DBImpl : public DB { // error recovery from going on in parallel. The latter, shutting_down_, // is set a little later during the shutdown after scheduling memtable // flushes - std::atomic shutdown_initiated_; + std::atomic shutdown_initiated_{false}; // Flag to indicate whether sst_file_manager object was allocated in // DB::Open() or passed to us bool own_sfm_; // Flag to check whether Close() has been called on this DB - bool closed_; + bool closed_ = false; // save the closing status, for re-calling the close() Status closing_status_; // mutex for DB::Close() @@ -3118,7 +3204,7 @@ class DBImpl : public DB { // The number of LockWAL called without matching UnlockWAL call. // See also lock_wal_write_token_ - uint32_t lock_wal_count_; + uint32_t lock_wal_count_ = 0; }; class GetWithTimestampReadCallback : public ReadCallback { diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 0cbb6c79e382..ab136b57b505 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -13,12 +13,14 @@ #include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "db/event_helpers.h" +#include "file/file_util.h" #include "file/sst_file_manager_impl.h" #include "logging/logging.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" +#include "options/options_helper.h" #include "rocksdb/file_system.h" #include "rocksdb/io_status.h" #include "rocksdb/options.h" @@ -143,10 +145,7 @@ IOStatus DBImpl::SyncClosedWals(const WriteOptions& write_options, Status DBImpl::FlushMemTableToOutputFile( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, bool* made_progress, JobContext* job_context, FlushReason flush_reason, - SuperVersionContext* superversion_context, - std::vector& snapshot_seqs, - SequenceNumber earliest_write_conflict_snapshot, - SnapshotChecker* snapshot_checker, LogBuffer* log_buffer, + SuperVersionContext* superversion_context, LogBuffer* log_buffer, Env::Priority thread_pri) { mutex_.AssertHeld(); assert(cfd); @@ -168,7 +167,7 @@ Status DBImpl::FlushMemTableToOutputFile( // had not been committed yet. Make sure we sync them to keep the persisted // WAL state at least as new as the persisted SST state. const bool needs_to_sync_closed_wals = - logfile_number_ > 0 && + cur_wal_number_ > 0 && (versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1 || allow_2pc()); @@ -210,7 +209,6 @@ Status DBImpl::FlushMemTableToOutputFile( FlushJob flush_job( dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_, - snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, job_context, flush_reason, log_buffer, directories_.GetDbDir(), GetDataDir(cfd, 0U), GetCompressionFlush(cfd->ioptions(), mutable_cf_options), stats_, @@ -224,7 +222,7 @@ Status DBImpl::FlushMemTableToOutputFile( bool need_cancel = false; IOStatus log_io_s = IOStatus::OK(); if (needs_to_sync_closed_wals) { - // SyncClosedWals() may unlock and re-lock the log_write_mutex multiple + // SyncClosedWals() may unlock and re-lock the wal_write_mutex multiple // times. VersionEdit synced_wals; bool error_recovery_in_prog = error_handler_.IsRecoveryInProgress(); @@ -395,11 +393,8 @@ Status DBImpl::FlushMemTablesToOutputFiles( bg_flush_args, made_progress, job_context, log_buffer, thread_pri); } assert(bg_flush_args.size() == 1); - std::vector snapshot_seqs; - SequenceNumber earliest_write_conflict_snapshot; - SnapshotChecker* snapshot_checker; - GetSnapshotContext(job_context, &snapshot_seqs, - &earliest_write_conflict_snapshot, &snapshot_checker); + InitSnapshotContext(job_context); + const auto& bg_flush_arg = bg_flush_args[0]; ColumnFamilyData* cfd = bg_flush_arg.cfd_; // intentional infrequent copy for each flush @@ -410,8 +405,7 @@ Status DBImpl::FlushMemTablesToOutputFiles( FlushReason flush_reason = bg_flush_arg.flush_reason_; Status s = FlushMemTableToOutputFile( cfd, mutable_cf_options_copy, made_progress, job_context, flush_reason, - superversion_context, snapshot_seqs, earliest_write_conflict_snapshot, - snapshot_checker, log_buffer, thread_pri); + superversion_context, log_buffer, thread_pri); return s; } @@ -446,12 +440,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( } #endif /* !NDEBUG */ - std::vector snapshot_seqs; - SequenceNumber earliest_write_conflict_snapshot; - SnapshotChecker* snapshot_checker; - GetSnapshotContext(job_context, &snapshot_seqs, - &earliest_write_conflict_snapshot, &snapshot_checker); - + InitSnapshotContext(job_context); autovector distinct_output_dirs; autovector distinct_output_dir_paths; std::vector> jobs; @@ -485,8 +474,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( jobs.emplace_back(new FlushJob( dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_, - &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot, - snapshot_checker, job_context, flush_reason, log_buffer, + &shutting_down_, job_context, flush_reason, log_buffer, directories_.GetDbDir(), data_dir, GetCompressionFlush(cfd->ioptions(), mutable_cf_options), stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, @@ -512,7 +500,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( job_context->job_id, flush_reason); } - if (logfile_number_ > 0) { + if (cur_wal_number_ > 0) { // TODO (yanqin) investigate whether we should sync the closed logs for // single column family case. VersionEdit synced_wals; @@ -528,7 +516,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && !log_io_s.IsColumnFamilyDropped()) { - if (total_log_size_ > 0) { + if (wals_total_size_.LoadRelaxed() > 0) { error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush); } else { // If the WAL is empty, we use different error reason @@ -967,6 +955,10 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, return Status::Incomplete(Status::SubCode::kManualCompactionPaused); } + if (compaction_aborted_.load(std::memory_order_acquire) > 0) { + return Status::Incomplete(Status::SubCode::kCompactionAborted); + } + if (options.canceled && options.canceled->load(std::memory_order_acquire)) { return Status::Incomplete(Status::SubCode::kManualCompactionPaused); } @@ -981,7 +973,8 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, std::string begin_str, end_str; auto [begin, end] = - MaybeAddTimestampsToRange(begin_without_ts, end_without_ts, ts_sz, + MaybeAddTimestampsToRange(OptSlice::CopyFromPtr(begin_without_ts), + OptSlice::CopyFromPtr(end_without_ts), ts_sz, &begin_str, &end_str, false /*exclusive_end*/); return CompactRangeInternal( @@ -1122,8 +1115,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, cfd->NumberLevels() > 1) { // Always compact all files together. final_output_level = cfd->NumberLevels() - 1; - // if bottom most level is reserved - if (immutable_db_options_.allow_ingest_behind) { + if (cfd->AllowIngestBehind()) { final_output_level--; } s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels, @@ -1392,6 +1384,9 @@ Status DBImpl::CompactFiles(const CompactionOptions& compact_options, TEST_SYNC_POINT_CALLBACK("TestCompactFiles:PausingManualCompaction:3", static_cast(const_cast*>( &manual_compaction_paused_))); + TEST_SYNC_POINT_CALLBACK("TestCancelCompactFiles:SuccessfulCompaction", + static_cast(const_cast*>( + &manual_compaction_paused_))); { InstrumentedMutexLock l(&mutex_); auto* current = cfd->current(); @@ -1433,6 +1428,57 @@ Status DBImpl::CompactFiles(const CompactionOptions& compact_options, return s; } +Status DBImpl::PerformTrivialMove(Compaction& c, LogBuffer* log_buffer, + bool& compaction_released, + size_t& moved_files, size_t& moved_bytes) { + mutex_.AssertHeld(); + + ROCKS_LOG_BUFFER(log_buffer, "[%s] Moving %d files to level-%d\n", + c.column_family_data()->GetName().c_str(), + static_cast(c.num_input_files(0)), c.output_level()); + + // Move files to the output level by editing the manifest + for (unsigned int l = 0; l < c.num_input_levels(); l++) { + if (c.level(l) == c.output_level()) { + continue; + } + for (size_t i = 0; i < c.num_input_files(l); i++) { + FileMetaData* f = c.input(l, i); + c.edit()->DeleteFile(c.level(l), f->fd.GetNumber()); + c.edit()->AddFile(c.output_level(), f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, + f->fd.smallest_seqno, f->fd.largest_seqno, + f->marked_for_compaction, f->temperature, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->epoch_number, + f->file_checksum, f->file_checksum_func_name, + f->unique_id, f->compensated_range_deletion_size, + f->tail_size, f->user_defined_timestamps_persisted, + f->min_timestamp, f->max_timestamp); + moved_bytes += static_cast(c.input(l, i)->fd.GetFileSize()); + ROCKS_LOG_BUFFER( + log_buffer, "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 " bytes\n", + c.column_family_data()->GetName().c_str(), f->fd.GetNumber(), + c.output_level(), f->fd.GetFileSize()); + } + moved_files += c.num_input_files(l); + } + + // Install the new version + const ReadOptions read_options(Env::IOActivity::kCompaction); + const WriteOptions write_options(Env::IOActivity::kCompaction); + Status status = versions_->LogAndApply( + c.column_family_data(), read_options, write_options, c.edit(), &mutex_, + directories_.GetDbDir(), /*new_descriptor_log=*/false, + /*column_family_options=*/nullptr, + [&c, &compaction_released](const Status& s) { + c.ReleaseCompactionFiles(s); + compaction_released = true; + }); + + return status; +} + Status DBImpl::CompactFilesImpl( const CompactionOptions& compact_options, ColumnFamilyData* cfd, Version* version, const std::vector& input_file_names, @@ -1444,7 +1490,17 @@ Status DBImpl::CompactFilesImpl( if (shutting_down_.load(std::memory_order_acquire)) { return Status::ShutdownInProgress(); } - if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) { + + // triggered by AbortAllCompactions + if (compaction_aborted_.load(std::memory_order_acquire) > 0) { + return Status::Incomplete(Status::SubCode::kCompactionAborted); + } + + // triggered by DisableManualCompactions or by user-set canceled flag in + // CompactionOptions + if (manual_compaction_paused_.load(std::memory_order_acquire) > 0 || + (compact_options.canceled && + compact_options.canceled->load(std::memory_order_acquire))) { return Status::Incomplete(Status::SubCode::kManualCompactionPaused); } @@ -1463,7 +1519,7 @@ Status DBImpl::CompactFilesImpl( } } - if (cfd->ioptions().allow_ingest_behind && + if (cfd->AllowIngestBehind() && output_level >= cfd->ioptions().num_levels - 1) { return Status::InvalidArgument( "Exceed the maximum output level defined by " @@ -1503,7 +1559,7 @@ Status DBImpl::CompactFilesImpl( std::unique_ptr c; assert(cfd->compaction_picker()); - c.reset(cfd->compaction_picker()->CompactFiles( + c.reset(cfd->compaction_picker()->PickCompactionForCompactFiles( compact_options, input_files, output_level, version->storage_info(), cfd->GetLatestMutableCFOptions(), mutable_db_options_, output_path_id)); // we already sanitized the set of input files and checked for conflicts @@ -1515,11 +1571,64 @@ Status DBImpl::CompactFilesImpl( // deletion compaction currently not allowed in CompactFiles. assert(!c->deletion_compaction()); - std::vector snapshot_seqs; - SequenceNumber earliest_write_conflict_snapshot; - SnapshotChecker* snapshot_checker; - GetSnapshotContext(job_context, &snapshot_seqs, - &earliest_write_conflict_snapshot, &snapshot_checker); + // Check if this can be a trivial move (metadata-only update) + // Similar to the logic in DBImpl::BackgroundCompaction + // Note: We disable trivial move when compaction_service is present because + // the service expects all compactions to go through CompactionJob for + // tracking + bool is_trivial_move = compact_options.allow_trivial_move && + c->IsTrivialMove() && + immutable_db_options().compaction_service == nullptr; + + if (is_trivial_move) { + // Perform trivial move: just update manifest without rewriting data + TEST_SYNC_POINT("DBImpl::CompactFilesImpl:TrivialMove"); + + bool compaction_released = false; + size_t moved_files = 0; + size_t moved_bytes = 0; + Status status = PerformTrivialMove( + *c.get(), log_buffer, compaction_released, moved_files, moved_bytes); + + if (status.ok()) { + InstallSuperVersionAndScheduleWork( + c->column_family_data(), job_context->superversion_contexts.data()); + + // Populate output file names for trivial move + if (output_file_names != nullptr) { + for (const auto& newf : c->edit()->GetNewFiles()) { + output_file_names->push_back(TableFileName( + c->immutable_options().cf_paths, newf.second.fd.GetNumber(), + newf.second.fd.GetPathId())); + } + } + + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Trivial move succeeded for %zu files, %zu bytes total\n", + c->column_family_data()->GetName().c_str(), moved_files, moved_bytes); + } else { + if (!compaction_released) { + c->ReleaseCompactionFiles(status); + } + ROCKS_LOG_BUFFER(log_buffer, "[%s] Trivial move failed: %s\n", + c->column_family_data()->GetName().c_str(), + status.ToString().c_str()); + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + } + + c.reset(); + bg_compaction_scheduled_--; + if (bg_compaction_scheduled_ == 0) { + bg_cv_.SignalAll(); + } + MaybeScheduleFlushOrCompaction(); + + return status; + } + + // Not a trivial move, proceed with full compaction + InitSnapshotContext(job_context); std::unique_ptr::iterator> pending_outputs_inserted_elem( new std::list::iterator( @@ -1533,22 +1642,21 @@ Status DBImpl::CompactFilesImpl( log_buffer, directories_.GetDbDir(), GetDataDir(c->column_family_data(), c->output_path_id()), GetDataDir(c->column_family_data(), 0), stats_, &mutex_, &error_handler_, - snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, job_context, table_cache_, &event_logger_, c->mutable_cf_options().paranoid_file_checks, c->mutable_cf_options().report_bg_io_stats, dbname_, &compaction_job_stats, Env::Priority::USER, io_tracer_, - kManualCompactionCanceledFalse_, db_id_, db_session_id_, - c->column_family_data()->GetFullHistoryTsLow(), c->trim_ts(), - &blob_callback_, &bg_compaction_scheduled_, + kManualCompactionCanceledFalse_, compaction_aborted_, db_id_, + db_session_id_, c->column_family_data()->GetFullHistoryTsLow(), + c->trim_ts(), &blob_callback_, &bg_compaction_scheduled_, &bg_bottom_compaction_scheduled_); // Creating a compaction influences the compaction score because the score // takes running compactions into account (by skipping files that are already // being compacted). Since we just changed compaction score, we recalculate it // here. - version->storage_info()->ComputeCompactionScore(cfd->ioptions(), - c->mutable_cf_options()); + version->storage_info()->ComputeCompactionScore( + cfd->ioptions(), c->mutable_cf_options(), cfd->GetFullHistoryTsLow()); compaction_job.Prepare(std::nullopt /*subcompact to be computed*/); @@ -1611,6 +1719,11 @@ Status DBImpl::CompactFilesImpl( "[%s] [JOB %d] Stopping manual compaction", c->column_family_data()->GetName().c_str(), job_context->job_id); + } else if (status.IsCompactionAborted()) { + // Don't report aborted compaction as error + ROCKS_LOG_INFO( + immutable_db_options_.info_log, "[%s] [JOB %d] Compaction aborted", + c->column_family_data()->GetName().c_str(), job_context->job_id); } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] [JOB %d] Compaction error: %s", @@ -1695,11 +1808,13 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, } c->SetNotifyOnCompactionCompleted(); + int num_l0_files = c->input_version()->storage_info()->NumLevelFiles(0); // release lock while notifying events mutex_.Unlock(); TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex"); { CompactionJobInfo info{}; + info.num_l0_files = num_l0_files; BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, &info); for (const auto& listener : immutable_db_options_.listeners) { listener->OnCompactionBegin(this, info); @@ -1724,11 +1839,13 @@ void DBImpl::NotifyOnCompactionCompleted( return; } + int num_l0_files = cfd->current()->storage_info()->NumLevelFiles(0); // release lock while notifying events mutex_.Unlock(); TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex"); { CompactionJobInfo info{}; + info.num_l0_files = num_l0_files; BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, &info); for (const auto& listener : immutable_db_options_.listeners) { listener->OnCompactionCompleted(this, info); @@ -1848,15 +1965,13 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { , LLONG_MAX /* max compaction bytes, not applicable */, 0 /* output path ID, not applicable */, mutable_cf_options.compression, - mutable_cf_options.compression_opts, - mutable_cf_options.default_write_temperature, + mutable_cf_options.compression_opts, Temperature::kUnknown, 0 /* max_subcompactions, not applicable */, {} /* grandparents, not applicable */, std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */, - false /* is manual */, "" /* trim_ts */, -1 /* score, not applicable */, - false /* is deletion compaction, not applicable */, - false /* l0_files_might_overlap, not applicable */, - CompactionReason::kRefitLevel)); + CompactionReason::kRefitLevel, "" /* trim_ts */, + -1 /* score, not applicable */, + false /* l0_files_might_overlap, not applicable */)); cfd->compaction_picker()->RegisterCompaction(c.get()); TEST_SYNC_POINT("DBImpl::ReFitLevel:PostRegisterCompaction"); VersionEdit edit; @@ -1871,7 +1986,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, f->compensated_range_deletion_size, f->tail_size, - f->user_defined_timestamps_persisted); + f->user_defined_timestamps_persisted, f->min_timestamp, + f->max_timestamp); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), @@ -1909,10 +2025,6 @@ int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { return cfh->cfd()->NumberLevels(); } -int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) { - return 0; -} - int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { auto cfh = static_cast_with_check(column_family); InstrumentedMutexLock l(&mutex_); @@ -2068,6 +2180,17 @@ Status DBImpl::RunManualCompaction( return manual.status; } + if (compaction_aborted_.load(std::memory_order_acquire) > 0) { + // All compactions are being aborted. Return immediately. + int counter = compaction_aborted_.load(std::memory_order_acquire); + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "RunManualCompaction: Aborting due to compaction_aborted_=%d", counter); + manual.status = Status::Incomplete(Status::SubCode::kCompactionAborted); + manual.done = true; + return manual.status; + } + // When a manual compaction arrives, temporarily disable scheduling of // non-manual compactions and wait until the number of scheduled compaction // jobs drops to zero. This used to be needed to ensure that this manual @@ -2092,6 +2215,13 @@ Status DBImpl::RunManualCompaction( // and `CompactRangeOptions::canceled` might not work well together. while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0) { + if (compaction_aborted_.load(std::memory_order_acquire) > 0) { + // Pretend the error came from compaction so the below cleanup/error + // handling code can process it. + manual.done = true; + manual.status = Status::Incomplete(Status::SubCode::kCompactionAborted); + break; + } if (manual_compaction_paused_ > 0 || manual.canceled == true) { // Pretend the error came from compaction so the below cleanup/error // handling code can process it. @@ -2182,6 +2312,7 @@ Status DBImpl::RunManualCompaction( // Don't throttle manual compaction, only count outstanding tasks. assert(false); } + ca->prepicked_compaction->need_repick = false; manual.incomplete = false; if (compaction->bottommost_level() && env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) { @@ -2209,7 +2340,12 @@ Status DBImpl::RunManualCompaction( if (!scheduled) { // There is nothing scheduled to wait on, so any cancellation can end the // manual now. - if (manual_compaction_paused_ > 0 || manual.canceled == true) { + if (compaction_aborted_.load(std::memory_order_acquire) > 0) { + // Stop waiting since it was canceled. Pretend the error came from + // compaction so the below cleanup/error handling code can process it. + manual.done = true; + manual.status = Status::Incomplete(Status::SubCode::kCompactionAborted); + } else if (manual_compaction_paused_ > 0 || manual.canceled == true) { // Stop waiting since it was canceled. Pretend the error came from // compaction so the below cleanup/error handling code can process it. manual.done = true; @@ -2711,6 +2847,10 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, // Finish waiting when ALL column families finish flushing memtables. // resuming_from_bg_err indicates whether the caller is trying to resume from // background error or in normal processing. +// Note that the wait finishes when the flush result is installed to column +// families' Versions and persisted in MANIFEST. It doesn't wait until +// SuperVersion to reflect the flush result, except for the case when +// flush_reason is `kExternalFileIngestion`. Status DBImpl::WaitForFlushMemTables( const autovector& cfds, const autovector& flush_memtable_ids, @@ -2784,16 +2924,8 @@ Status DBImpl::WaitForFlushMemTables( Status DBImpl::EnableAutoCompaction( const std::vector& column_family_handles) { - Status s; - for (auto cf_ptr : column_family_handles) { - Status status = - this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}}); - if (!status.ok()) { - s = status; - } - } - - return s; + return SetOptions(column_family_handles, + {{"disable_auto_compactions", "false"}}); } // NOTE: Calling DisableManualCompaction() may overwrite the @@ -2831,6 +2963,61 @@ void DBImpl::EnableManualCompaction() { manual_compaction_paused_.fetch_sub(1, std::memory_order_release); } +void DBImpl::AbortAllCompactions() { + InstrumentedMutexLock l(&mutex_); + + // Increment the abort counter to signal all compactions to abort + compaction_aborted_.fetch_add(1, std::memory_order_release); + + TEST_SYNC_POINT("DBImpl::AbortAllCompactions:FlagSet"); + + // Mark all manual compactions as canceled + for (const auto& manual_compaction : manual_compaction_dequeue_) { + manual_compaction->canceled = true; + } + + // Wake up any waiting compaction threads to check the abort signal + bg_cv_.SignalAll(); + + // Wait for all running compactions (both manual and automatic) to finish + // or abort before returning. + // Note: bg_cv_.Wait() releases the mutex while waiting, so other threads + // can make progress and signal when compactions complete. + while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0 || + HasPendingManualCompaction()) { + bg_cv_.Wait(); + } +} + +void DBImpl::ResumeAllCompactions() { + InstrumentedMutexLock l(&mutex_); + int before = compaction_aborted_.load(std::memory_order_acquire); + + // Guard against calling Resume without prior Abort + if (before <= 0) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "ResumeAllCompactions called without prior " + "AbortAllCompactions (counter=%d)", + before); + return; + } + + // Decrement the abort counter + compaction_aborted_.fetch_sub(1, std::memory_order_release); + + // As the operation is executed under db mutex, we could just use before value + // to calculate the current value. + int current = before - 1; + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "ResumeAllCompactions: counter %d -> %d", before, current); + + // If this is the last resume call (abort counter back to 0), schedule + // compactions that may have been waiting + if (current == 0) { + MaybeScheduleFlushOrCompaction(); + } +} + void DBImpl::MaybeScheduleFlushOrCompaction() { mutex_.AssertHeld(); TEST_SYNC_POINT("DBImpl::MaybeScheduleFlushOrCompaction:Start"); @@ -2895,6 +3082,9 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { if (bg_compaction_paused_ > 0) { // we paused the background compaction return; + } else if (compaction_aborted_.load(std::memory_order_acquire) > 0) { + // we are aborting all compactions + return; } else if (error_handler_.IsBGWorkStopped()) { // Compaction is not part of the recovery sequence from a hard error. We // might get here because recovery might do a flush and install a new @@ -3404,6 +3594,10 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, bool made_progress = false; JobContext job_context(next_job_id_.fetch_add(1), true); TEST_SYNC_POINT("BackgroundCallCompaction:0"); + if (bg_thread_pri == Env::Priority::BOTTOM) { + TEST_SYNC_POINT("BackgroundCallCompaction:0:BottomPri"); + } + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, immutable_db_options_.info_log.get()); { @@ -3428,7 +3622,8 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, 10000); // prevent hot loop mutex_.Lock(); } else if (!s.ok() && !s.IsShutdownInProgress() && - !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) { + !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped() && + !s.IsCompactionAborted()) { // Wait a little bit before retrying background compaction in // case this is an environmental problem and we do not want to // chew up resources for failed compactions for the duration of @@ -3460,6 +3655,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, // case of a failure). Thus, we force full scan in FindObsoleteFiles() FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && !s.IsManualCompactionPaused() && + !s.IsCompactionAborted() && !s.IsColumnFamilyDropped() && !s.IsBusy()); TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"); @@ -3564,6 +3760,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, if (!error_handler_.IsBGWorkStopped()) { if (shutting_down_.load(std::memory_order_acquire)) { status = Status::ShutdownInProgress(); + } else if (compaction_aborted_.load(std::memory_order_acquire) > 0) { + status = Status::Incomplete(Status::SubCode::kCompactionAborted); } else if (is_manual && manual_compaction->canceled.load(std::memory_order_acquire)) { status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); @@ -3639,34 +3837,54 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, : m->manual_end->DebugString(true).c_str())); } } - } else if (!is_prepicked && !compaction_queue_.empty()) { + } else if (ShouldPickCompaction(is_prepicked, prepicked_compaction)) { + bool need_repick = is_prepicked && prepicked_compaction->need_repick; if (HasExclusiveManualCompaction()) { - // Can't compact right now, but try again later TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict"); - // Stay in the compaction queue. - unscheduled_compactions_++; + // TODO(hx235): Resolve conflict between intended + // bottom-priority compaction (requiring repick, i.e., need_repick = true) + // and exclusive manual compaction by releasing the intended + // bottom-priority compaction. + if (!need_repick) { + // Can't compact right now, but try again later + // + // Increase `unscheduled_compactions_` directly so we + // don't need to + // dequeue and enqueue the CFD again in the compaction queue and thus + // keep the CFD's position in the queue + unscheduled_compactions_++; - return Status::OK(); + return Status::OK(); + } } - auto cfd = PickCompactionFromQueue(&task_token, log_buffer); - if (cfd == nullptr) { - // Can't find any executable task from the compaction queue. - // All tasks have been throttled by compaction thread limiter. - ++unscheduled_compactions_; - return Status::Busy(); - } + ColumnFamilyData* cfd = nullptr; + + if (!need_repick) { + cfd = PickCompactionFromQueue(&task_token, log_buffer); + if (cfd == nullptr) { + // Can't find any executable task from the compaction queue. + // All tasks have been throttled by compaction thread limiter. + ++unscheduled_compactions_; + return Status::Busy(); + } - // We unreference here because the following code will take a Ref() on - // this cfd if it is going to use it (Compaction class holds a - // reference). - // This will all happen under a mutex so we don't have to be afraid of - // somebody else deleting it. - if (cfd->UnrefAndTryDelete()) { - // This was the last reference of the column family, so no need to - // compact. - return Status::OK(); + // We unreference here because the following code will take a Ref() on + // this cfd if it is going to use it (Compaction class holds a + // reference). + // This will all happen under a mutex so we don't have to be afraid of + // somebody else deleting it. + if (cfd->UnrefAndTryDelete()) { + // This was the last reference of the column family, so no need to + // compact. + return Status::OK(); + } + } else { + cfd = c->column_family_data(); + assert(cfd); + ResetBottomPriCompactionIntent(cfd, c); + assert(c == nullptr); } // Pick up latest mutable CF Options and use it throughout the @@ -3680,21 +3898,24 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // compaction is not necessary. Need to make sure mutex is held // until we make a copy in the following code TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction"); - SnapshotChecker* snapshot_checker = nullptr; - std::vector snapshot_seqs; // This info is not useful for other scenarios, so save querying existing // snapshots for those cases. if (cfd->ioptions().compaction_style == kCompactionStyleUniversal && cfd->user_comparator()->timestamp_size() == 0) { - SequenceNumber earliest_write_conflict_snapshot; - GetSnapshotContext(job_context, &snapshot_seqs, - &earliest_write_conflict_snapshot, - &snapshot_checker); + InitSnapshotContext(job_context); assert(is_snapshot_supported_ || snapshots_.empty()); } - c.reset(cfd->PickCompaction(mutable_cf_options, mutable_db_options_, - snapshot_seqs, snapshot_checker, log_buffer)); - TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction"); + c.reset(cfd->PickCompaction( + mutable_cf_options, mutable_db_options_, job_context->snapshot_seqs, + job_context->snapshot_checker, log_buffer, + thread_pri == Env::Priority::BOTTOM /* require_max_output_level */)); + if (thread_pri == Env::Priority::LOW) { + TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction"); + } else if (thread_pri == Env::Priority::BOTTOM) { + TEST_SYNC_POINT_CALLBACK( + "DBImpl::BackgroundCompaction():AfterPickCompactionBottomPri", + c.get()); + } if (c != nullptr) { bool enough_room = EnoughRoomForCompaction( @@ -3707,8 +3928,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, ->current() ->storage_info() ->ComputeCompactionScore(c->immutable_options(), - c->mutable_cf_options()); - AddToCompactionQueue(cfd); + c->mutable_cf_options(), + cfd->GetFullHistoryTsLow()); + EnqueuePendingCompaction(cfd); c.reset(); // Don't need to sleep here, because BackgroundCallCompaction @@ -3730,16 +3952,21 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // options take effect. // 3) When we Pick a new compaction, we "remove" those files being // compacted from the calculation, which then influences compaction - // score. Here we check if we need the new compaction even without the - // files that are currently being compacted. If we need another - // compaction, we might be able to execute it in parallel, so we add - // it to the queue and schedule a new thread. - if (cfd->NeedsCompaction()) { - // Yes, we need more compactions! - AddToCompactionQueue(cfd); - MaybeScheduleFlushOrCompaction(); - } + // score. Inside EnqueuePendingCompaction(), we check if we need + // the new compaction even without the files that are currently being + // compacted. If we need another compaction, we might be able to + // execute it in parallel, so we add it to the queue and schedule a + // new thread. + EnqueuePendingCompaction(cfd); + MaybeScheduleFlushOrCompaction(); } + } else if (is_prepicked) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Pre-picked compaction repicked files for compaction as " + "required, " + "but upon re-evaluation, no compaction was found necessary \n", + cfd->GetName().c_str()); } } } @@ -3781,11 +4008,253 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, c->column_family_data()->GetName().c_str(), c->num_input_files(0)); if (status.ok() && io_s.ok()) { - UpdateDeletionCompactionStats(c); + UpdateFIFOCompactionStatus(c); } *made_progress = true; TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", c->column_family_data()); + } else if (c->is_trivial_copy_compaction()) { + TEST_SYNC_POINT_CALLBACK( + "DBImpl::BackgroundCompaction:TriviaCopyBeforeCompaction", + c->column_family_data()); + assert(c->num_input_files(1) == 0); + assert(c->column_family_data()->ioptions().compaction_style == + kCompactionStyleFIFO); + assert(c->compaction_reason() == CompactionReason::kChangeTemperature); + + compaction_job_stats.num_input_files = c->num_input_files(0); + + NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + + std::vector out_files; + for (const auto& in_file : *c->inputs(0)) { + const uint64_t out_file_number = versions_->NewFileNumber(); + const std::string in_fname = + TableFileName(c->immutable_options().cf_paths, + in_file->fd.GetNumber(), in_file->fd.GetPathId()); + const std::string out_fname = + TableFileName(c->immutable_options().cf_paths, out_file_number, + c->output_path_id()); + + // TODO (mikechuang): Currently skip calling + // EventHelpers::NotifyTableFileCreationStarted for the trivial copy. + // Since it's a trivial copy we should ideally use the exact + // TableProperties from the input file but that will break some existing + // stress tests. For now skip the listener call for the FIFO + // kChangeTemperature trivial copy move. + + int64_t tmp_current_time = 0; + auto get_time_status = + immutable_db_options_.clock->GetCurrentTime(&tmp_current_time); + if (!get_time_status.ok()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] WARNING: Failed to get current time %s " + "status=%s", + c->column_family_data()->GetName().c_str(), + get_time_status.ToString().c_str()); + } + uint64_t out_file_creation_time = static_cast(tmp_current_time); + + FileOptions copied_file_options = file_options_; + copied_file_options.temperature = c->GetOutputTemperature(); + std::unique_ptr dest_writer; + { + std::unique_ptr dest_file; + IOStatus writable_file_io_status = + immutable_db_options_.fs.get()->NewWritableFile( + out_fname, copied_file_options, &dest_file, nullptr /* dbg */); + TEST_SYNC_POINT_CALLBACK( + "NewWritableFile::FileOptions.temperature", + const_cast(&copied_file_options.temperature)); + if (!writable_file_io_status.ok()) { + io_s = writable_file_io_status; + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Error: Abort trivial copy compaction, failed to open " + "NewWritableFile %s\n" + " out_fname=%s, temperature=%s, io_status=%s", + c->column_family_data()->GetName().c_str(), out_fname.c_str(), + temperature_to_string[c->GetOutputTemperature()].c_str(), + io_s.ToString().c_str()); + break; + } + + FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; + dest_writer.reset(new WritableFileWriter( + std::move(dest_file), out_fname, copied_file_options, + immutable_db_options_.clock, io_tracer_, + immutable_db_options_.stats, Histograms::SST_WRITE_MICROS, + c->immutable_options().listeners, + immutable_db_options_.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile), false)); + } + + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Started copying from: %s\n" + " temperature=%s, to: %s, temperature=%s, buffer_size=%" PRIu64, + c->column_family_data()->GetName().c_str(), in_fname.c_str(), + temperature_to_string[in_file->temperature].c_str(), + out_fname.c_str(), + temperature_to_string[c->GetOutputTemperature()].c_str(), + c->mutable_cf_options() + .compaction_options_fifo.trivial_copy_buffer_size); + // Add IO_LOW HINT for compaction + IOOptions copy_files_compaction_io_options; + copy_files_compaction_io_options.rate_limiter_priority = + Env::IOPriority::IO_LOW; + copy_files_compaction_io_options.type = IOType::kData; + copy_files_compaction_io_options.io_activity = + Env::IOActivity::kCompaction; + + IOStatus copy_file_io_status = CopyFile( + immutable_db_options_.fs.get() /* fileSystem */, + in_fname /* source */, in_file->temperature /* src_temp_hint */, + dest_writer /* dest_writer */, 0 /* size */, true /* use_fsync */, + io_tracer_ /* io_tracer*/, + c->mutable_cf_options() + .compaction_options_fifo + .trivial_copy_buffer_size /* max_read_buffer_size + */ + , + copy_files_compaction_io_options /* readIOOptions */, + copy_files_compaction_io_options /* writeIOOptions */); + if (dest_writer) { + IOOptions close_files_compaction_io_options; + close_files_compaction_io_options.rate_limiter_priority = + Env::IOPriority::IO_LOW; + close_files_compaction_io_options.type = IOType::kData; + close_files_compaction_io_options.io_activity = + Env::IOActivity::kCompaction; + // Close the dest_write + io_s = dest_writer->Close(close_files_compaction_io_options); + if (!io_s.ok()) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Failed to close the writer. Failed to copy from: %s\n" + " temperature=%s, to=%s, temperature=%s, io_status=%s", + c->column_family_data()->GetName().c_str(), in_fname.c_str(), + temperature_to_string[in_file->temperature].c_str(), + out_fname.c_str(), + temperature_to_string[c->GetOutputTemperature()].c_str(), + io_s.ToString().c_str()); + break; + } + } + + io_s = copy_file_io_status; + + if (!io_s.ok()) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Failed to copy from: %s\n" + " temperature=%s, to=%s, temperature=%s, io_status=%s", + c->column_family_data()->GetName().c_str(), in_fname.c_str(), + temperature_to_string[in_file->temperature].c_str(), + out_fname.c_str(), + temperature_to_string[c->GetOutputTemperature()].c_str(), + io_s.ToString().c_str()); + break; + } + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Successfully copying from: %s\n" + " temperature=%s, to=%s, temperature=%s, io_status=%s", + c->column_family_data()->GetName().c_str(), + in_fname.c_str(), + temperature_to_string[in_file->temperature].c_str(), + out_fname.c_str(), + temperature_to_string[c->GetOutputTemperature()].c_str(), + io_s.ToString().c_str()); + + FileMetaData out_file_metadata{ + out_file_number, + c->output_path_id(), + in_file->fd.GetFileSize(), + in_file->smallest, + in_file->largest, + in_file->fd.smallest_seqno, + in_file->fd.largest_seqno, + false /* marked_for_compact */, + c->GetOutputTemperature() /* temperature */, + in_file->oldest_blob_file_number, + in_file->oldest_ancester_time, + out_file_creation_time, + c->MinInputFileEpochNumber(), + dest_writer->GetFileChecksum(), + dest_writer->GetFileChecksumFuncName(), + in_file->unique_id, + in_file->compensated_range_deletion_size, + in_file->tail_size, + in_file->user_defined_timestamps_persisted, + in_file->min_timestamp, + in_file->max_timestamp}; + + out_files.push_back(std::move(out_file_metadata)); + } + + // Update version set + if (status.ok() && io_s.ok()) { + // NOTE: ChangeTemperature should only copy one file at one file + // hence *c->inputs(0) == out_files.size() == 1 if copy succeeded + assert(c->inputs(0)->size() == 1); + assert(out_files.size() == 1); + + auto out_file_metadata_it = out_files.begin(); + for (const auto& in_file : *c->inputs(0)) { + if (out_file_metadata_it == out_files.end()) { + break; + } + + c->edit()->DeleteFile(c->level(), in_file->fd.GetNumber()); + c->edit()->AddFile(c->level(), *out_file_metadata_it); + ++out_file_metadata_it; + } + + status = versions_->LogAndApply( + c->column_family_data(), read_options, write_options, c->edit(), + &mutex_, directories_.GetDbDir(), + /*new_descriptor_log=*/false, /*column_family_options=*/nullptr, + [&c, &compaction_released](const Status& s) { + c->ReleaseCompactionFiles(s); + compaction_released = true; + }); + } + + // TODO (mikechuang): Currently skip calling + // EventHelper::LogAndNotifyTableFileCreationFinished for the trivial copy. + // Since it's a trivial copy we should ideally use the exact TableProperties + // from the input file but that will break some existing stress tests. For + // now skip the listener call for the FIFO kChangeTemperature trivial copy + // move. + + if (io_s.ok()) { + io_s = versions_->io_status(); + } + + InstallSuperVersionAndScheduleWork( + c->column_family_data(), job_context->superversion_contexts.data()); + if (status.ok() && io_s.ok()) { + UpdateFIFOCompactionStatus(c); + } else { + for (const auto& in_file : *c->inputs(0)) { + const std::string in_fname = + TableFileName(c->immutable_options().cf_paths, + in_file->fd.GetNumber(), in_file->fd.GetPathId()); + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Failed to do trvial copy compaction: %s" + " temperature=%s, to temperature=%s, status=%s, io_status=%s", + c->column_family_data()->GetName().c_str(), in_fname.c_str(), + temperature_to_string[in_file->temperature].c_str(), + temperature_to_string[c->GetOutputTemperature()].c_str(), + status.ToString().c_str(), io_s.ToString().c_str()); + } + } + *made_progress = true; + TEST_SYNC_POINT_CALLBACK( + "DBImpl::BackgroundCompaction:TriviaCopyAfterCompaction", + c->column_family_data()); } else if (!trivial_move_disallowed && c->IsTrivialMove()) { TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove"); TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction", @@ -3798,39 +4267,12 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, compaction_job_stats.num_input_files = c->num_input_files(0); // Trivial moves do not get compacted remotely compaction_job_stats.is_remote_compaction = false; + compaction_job_stats.num_input_files_trivially_moved = + compaction_job_stats.num_input_files; NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); - // Move files to next level - int32_t moved_files = 0; - int64_t moved_bytes = 0; - for (unsigned int l = 0; l < c->num_input_levels(); l++) { - if (c->level(l) == c->output_level()) { - continue; - } - for (size_t i = 0; i < c->num_input_files(l); i++) { - FileMetaData* f = c->input(l, i); - c->edit()->DeleteFile(c->level(l), f->fd.GetNumber()); - c->edit()->AddFile( - c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(), - f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, - f->fd.largest_seqno, f->marked_for_compaction, f->temperature, - f->oldest_blob_file_number, f->oldest_ancester_time, - f->file_creation_time, f->epoch_number, f->file_checksum, - f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size, f->tail_size, - f->user_defined_timestamps_persisted); - - ROCKS_LOG_BUFFER( - log_buffer, - "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n", - c->column_family_data()->GetName().c_str(), f->fd.GetNumber(), - c->output_level(), f->fd.GetFileSize()); - ++moved_files; - moved_bytes += f->fd.GetFileSize(); - } - } if (c->compaction_reason() == CompactionReason::kLevelMaxLevelSize && c->immutable_options().compaction_pri == kRoundRobin) { int start_level = c->start_level(); @@ -3841,14 +4283,12 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, vstorage->GetNextCompactCursor(start_level, c->num_input_files(0))); } } - status = versions_->LogAndApply( - c->column_family_data(), read_options, write_options, c->edit(), - &mutex_, directories_.GetDbDir(), - /*new_descriptor_log=*/false, /*column_family_options=*/nullptr, - [&c, &compaction_released](const Status& s) { - c->ReleaseCompactionFiles(s); - compaction_released = true; - }); + + // Perform the trivial move + size_t moved_files = 0; + size_t moved_bytes = 0; + status = PerformTrivialMove(*c.get(), log_buffer, compaction_released, + moved_files, moved_bytes); io_s = versions_->io_status(); InstallSuperVersionAndScheduleWork( c->column_family_data(), job_context->superversion_contexts.data()); @@ -3863,8 +4303,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, << "total_files_size" << moved_bytes; } ROCKS_LOG_BUFFER( - log_buffer, - "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n", + log_buffer, "[%s] Moved #%d files to level-%zu %zu bytes %s: %s\n", c->column_family_data()->GetName().c_str(), moved_files, c->output_level(), moved_bytes, status.ToString().c_str(), c->column_family_data()->current()->storage_info()->LevelSummary(&tmp)); @@ -3874,14 +4313,17 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, ThreadStatusUtil::ResetThreadStatus(); TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", c->column_family_data()); - } else if (!is_prepicked && c->output_level() > 0 && - c->output_level() == + } else if (!is_prepicked && + Compaction::OutputToNonZeroMaxOutputLevel( + c->output_level(), c->column_family_data() ->current() ->storage_info() ->MaxOutputLevel( - immutable_db_options_.allow_ingest_behind) && + c->immutable_options().cf_allow_ingest_behind || + immutable_db_options_.allow_ingest_behind)) && env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) { + assert(thread_pri == Env::Priority::LOW); // Forward compactions involving last level to the bottom pool if it exists, // such that compactions unlikely to contribute to write stalls can be // delayed or deprioritized. @@ -3890,7 +4332,23 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, ca->db = this; ca->compaction_pri_ = Env::Priority::BOTTOM; ca->prepicked_compaction = new PrepickedCompaction; - ca->prepicked_compaction->compaction = c.release(); + + // If `universal_reduce_file_locking` is true, we only lock a limited set of + // input files by creating an intended compaction to forward to bottom + // priority pool and repicking files when bottom priority thread + // gets to execute this intended compaction + const bool need_repick = + c->mutable_cf_options() + .compaction_options_universal.reduce_file_locking; + if (need_repick) { + ca->prepicked_compaction->compaction = + CreateIntendedCompactionForwardedToBottomPriorityPool(c.get()); + c.reset(); + ca->prepicked_compaction->need_repick = true; + } else { + ca->prepicked_compaction->compaction = c.release(); + ca->prepicked_compaction->need_repick = false; + } ca->prepicked_compaction->manual_compaction_state = nullptr; // Transfer requested token, so it doesn't need to do it again. ca->prepicked_compaction->task_token = std::move(task_token); @@ -3905,11 +4363,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, output_level = c->output_level(); TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial", &output_level); - std::vector snapshot_seqs; - SequenceNumber earliest_write_conflict_snapshot; - SnapshotChecker* snapshot_checker; - GetSnapshotContext(job_context, &snapshot_seqs, - &earliest_write_conflict_snapshot, &snapshot_checker); + InitSnapshotContext(job_context); assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJob compaction_job( @@ -3918,15 +4372,15 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, &shutting_down_, log_buffer, directories_.GetDbDir(), GetDataDir(c->column_family_data(), c->output_path_id()), GetDataDir(c->column_family_data(), 0), stats_, &mutex_, - &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot, - snapshot_checker, job_context, table_cache_, &event_logger_, + &error_handler_, job_context, table_cache_, &event_logger_, c->mutable_cf_options().paranoid_file_checks, c->mutable_cf_options().report_bg_io_stats, dbname_, &compaction_job_stats, thread_pri, io_tracer_, is_manual ? manual_compaction->canceled : kManualCompactionCanceledFalse_, - db_id_, db_session_id_, c->column_family_data()->GetFullHistoryTsLow(), - c->trim_ts(), &blob_callback_, &bg_compaction_scheduled_, + compaction_aborted_, db_id_, db_session_id_, + c->column_family_data()->GetFullHistoryTsLow(), c->trim_ts(), + &blob_callback_, &bg_compaction_scheduled_, &bg_bottom_compaction_scheduled_); compaction_job.Prepare(std::nullopt /*subcompact to be computed*/); @@ -3939,8 +4393,15 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); mutex_.Unlock(); - TEST_SYNC_POINT_CALLBACK( - "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); + if (thread_pri == Env::Priority::LOW) { + TEST_SYNC_POINT_CALLBACK( + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); + } else { + assert(thread_pri == Env::Priority::BOTTOM); + TEST_SYNC_POINT( + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRunBottomPri"); + } + // Should handle error? compaction_job.Run().PermitUncheckedError(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); @@ -4002,7 +4463,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } if (status.ok() || status.IsCompactionTooLarge() || - status.IsManualCompactionPaused()) { + status.IsManualCompactionPaused() || status.IsCompactionAborted()) { // Done } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down @@ -4033,10 +4494,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, ->current() ->storage_info() ->ComputeCompactionScore(c->immutable_options(), - c->mutable_cf_options()); - if (!cfd->queued_for_compaction()) { - AddToCompactionQueue(cfd); - } + c->mutable_cf_options(), + cfd->GetFullHistoryTsLow()); + EnqueuePendingCompaction(cfd); } } // this will unref its input_version and column_family_data @@ -4081,6 +4541,72 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, return status; } +// Create an intended compaction to forward based on the original picked +// compaction. It serves two purposes while it is waiting +// for a bottom-priority thread becomes available to run: +// - Prevent the last input file (or sorted run if non-L0) from +// being included in compaction score calculations unnecessarily since the +// intended compaction is already scheduled to compact it +// - Allow other input files to be picked by low-priority compactions that can +// run right away +// +// Once a bottom-priority available to run this intended compaction, it will +// repick files to consider the LSM updates that occurred during the waiting +// period. +Compaction* DBImpl::CreateIntendedCompactionForwardedToBottomPriorityPool( + Compaction* c) { + auto* cfd = c->column_family_data(); + const auto& io = c->immutable_options(); + const auto& mo = c->mutable_cf_options(); + auto* vstorage = c->input_version()->storage_info(); + + std::vector inputs(1); + + const std::vector* max_intput_level_files = nullptr; + int max_intput_level = 0; + + for (size_t i = c->num_input_levels(); i >= 1; --i) { + size_t level = i - 1; + if (c->num_input_files(level) > 0) { + max_intput_level = static_cast(level); + max_intput_level_files = c->inputs(level); + break; + } + } + + assert(max_intput_level_files); + assert(!max_intput_level_files->empty()); + inputs[0].level = max_intput_level; + + if (max_intput_level == 0) { + // The last input file + inputs[0].files.push_back( + (*max_intput_level_files)[max_intput_level_files->size() - 1]); + } else { + // The last input sorted run + for (FileMetaData* f : (*max_intput_level_files)) { + inputs[0].files.push_back(f); + } + } + + c->ReleaseCompactionFiles(Status::OK()); + + Compaction* intended_compaction = + new Compaction(vstorage, io, mo, mutable_db_options_, std::move(inputs), + c->output_level(), c->target_output_file_size(), + c->max_compaction_bytes(), c->output_path_id(), + c->output_compression(), c->output_compression_opts(), + c->GetOutputTemperature(), c->max_subcompactions(), + c->grandparents(), std::nullopt /* earliest_snapshot */, + nullptr /* snapshot_checker */, c->compaction_reason()); + + cfd->compaction_picker()->RegisterCompaction(intended_compaction); + vstorage->ComputeCompactionScore(io, mo, cfd->GetFullHistoryTsLow()); + intended_compaction->FinalizeInputInfo(cfd->current()); + + return intended_compaction; +} + bool DBImpl::HasPendingManualCompaction() { return (!manual_compaction_dequeue_.empty()); } @@ -4169,8 +4695,7 @@ bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) { return false; } -void DBImpl::UpdateDeletionCompactionStats( - const std::unique_ptr& c) { +void DBImpl::UpdateFIFOCompactionStatus(const std::unique_ptr& c) { if (c == nullptr) { return; } @@ -4184,6 +4709,9 @@ void DBImpl::UpdateDeletionCompactionStats( case CompactionReason::kFIFOTtl: RecordTick(stats_, FIFO_TTL_COMPACTIONS); break; + case CompactionReason::kChangeTemperature: + RecordTick(stats_, FIFO_CHANGE_TEMPERATURE_COMPACTIONS); + break; default: assert(false); break; @@ -4198,6 +4726,7 @@ void DBImpl::BuildCompactionJobInfo( compaction_job_info->cf_id = cfd->GetID(); compaction_job_info->cf_name = cfd->GetName(); compaction_job_info->status = st; + compaction_job_info->aborted = st.IsCompactionAborted(); compaction_job_info->thread_id = env_->GetThreadID(); compaction_job_info->job_id = job_id; compaction_job_info->base_input_level = c->start_level(); @@ -4273,9 +4802,10 @@ void DBImpl::BuildCompactionJobInfo( // for superversion_to_free void DBImpl::InstallSuperVersionAndScheduleWork( - ColumnFamilyData* cfd, SuperVersionContext* sv_context) { + ColumnFamilyData* cfd, SuperVersionContext* sv_context, + std::optional> + new_seqno_to_time_mapping) { mutex_.AssertHeld(); - const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions(); // Update max_total_in_memory_state_ size_t old_memtable_size = 0; @@ -4289,7 +4819,8 @@ void DBImpl::InstallSuperVersionAndScheduleWork( if (UNLIKELY(sv_context->new_superversion == nullptr)) { sv_context->NewSuperVersion(); } - cfd->InstallSuperVersion(sv_context, mutable_cf_options); + cfd->InstallSuperVersion(sv_context, &mutex_, + std::move(new_seqno_to_time_mapping)); // There may be a small data race here. The snapshot tricking bottommost // compaction may already be released here. But assuming there will always be @@ -4298,7 +4829,7 @@ void DBImpl::InstallSuperVersionAndScheduleWork( bottommost_files_mark_threshold_ = kMaxSequenceNumber; standalone_range_deletion_files_mark_threshold_ = kMaxSequenceNumber; for (auto* my_cfd : *versions_->GetColumnFamilySet()) { - if (!my_cfd->ioptions().allow_ingest_behind) { + if (!my_cfd->AllowIngestBehind()) { bottommost_files_mark_threshold_ = std::min( bottommost_files_mark_threshold_, my_cfd->current()->storage_info()->bottommost_files_mark_threshold()); @@ -4316,9 +4847,10 @@ void DBImpl::InstallSuperVersionAndScheduleWork( MaybeScheduleFlushOrCompaction(); // Update max_total_in_memory_state_ - max_total_in_memory_state_ = max_total_in_memory_state_ - old_memtable_size + - mutable_cf_options.write_buffer_size * - mutable_cf_options.max_write_buffer_number; + max_total_in_memory_state_ = + max_total_in_memory_state_ - old_memtable_size + + cfd->GetLatestMutableCFOptions().write_buffer_size * + cfd->GetLatestMutableCFOptions().max_write_buffer_number; } // ShouldPurge is called by FindObsoleteFiles when doing a full scan, @@ -4347,31 +4879,33 @@ void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) { snapshot_checker_.reset(snapshot_checker); } -void DBImpl::GetSnapshotContext( - JobContext* job_context, std::vector* snapshot_seqs, - SequenceNumber* earliest_write_conflict_snapshot, - SnapshotChecker** snapshot_checker_ptr) { +void DBImpl::InitSnapshotContext(JobContext* job_context) { mutex_.AssertHeld(); assert(job_context != nullptr); - assert(snapshot_seqs != nullptr); - assert(earliest_write_conflict_snapshot != nullptr); - assert(snapshot_checker_ptr != nullptr); - - *snapshot_checker_ptr = snapshot_checker_.get(); - if (use_custom_gc_ && *snapshot_checker_ptr == nullptr) { - *snapshot_checker_ptr = DisableGCSnapshotChecker::Instance(); + if (job_context->snapshot_context_initialized) { + return; + } + SnapshotChecker* snapshot_checker = snapshot_checker_.get(); + if (use_custom_gc_ && !snapshot_checker) { + snapshot_checker = DisableGCSnapshotChecker::Instance(); } - if (*snapshot_checker_ptr != nullptr) { + std::unique_ptr managed_snapshot = nullptr; + if (snapshot_checker) { // If snapshot_checker is used, that means the flush/compaction may // contain values not visible to snapshot taken after // flush/compaction job starts. Take a snapshot and it will appear // in snapshot_seqs and force compaction iterator to consider such // snapshots. - const Snapshot* job_snapshot = - GetSnapshotImpl(false /*write_conflict_boundary*/, false /*lock*/); - job_context->job_snapshot.reset(new ManagedSnapshot(this, job_snapshot)); - } - *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot); + const Snapshot* snapshot = + GetSnapshotImpl(/*is_write_conflict_boundary=*/false, /*lock=*/false); + managed_snapshot.reset(new ManagedSnapshot(this, snapshot)); + } + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber; + std::vector snapshot_seqs = + snapshots_.GetAll(&earliest_write_conflict_snapshot); + job_context->InitSnapshotContext( + snapshot_checker, std::move(managed_snapshot), + earliest_write_conflict_snapshot, std::move(snapshot_seqs)); } Status DBImpl::WaitForCompact( @@ -4430,4 +4964,19 @@ Status DBImpl::WaitForCompact( } } +bool DBImpl::ShouldPickCompaction( + bool is_prepicked, const PrepickedCompaction* prepicked_compaction) { + return (!is_prepicked && !compaction_queue_.empty()) || + (is_prepicked && prepicked_compaction->need_repick); +} + +void DBImpl::ResetBottomPriCompactionIntent(ColumnFamilyData* cfd, + std::unique_ptr& c) { + c->ReleaseCompactionFiles(Status::OK()); + cfd->current()->storage_info()->ComputeCompactionScore( + c->immutable_options(), c->mutable_cf_options(), + cfd->GetFullHistoryTsLow()); + c.reset(); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index 38873b0e3212..138527bb782e 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -84,6 +84,7 @@ void DBImpl::TEST_GetFilesMetaData( } uint64_t DBImpl::TEST_Current_Manifest_FileNo() { + InstrumentedMutexLock l(&mutex_); return versions_->manifest_file_number(); } @@ -224,13 +225,13 @@ void DBImpl::TEST_EndWrite(void* w) { } size_t DBImpl::TEST_LogsToFreeSize() { - InstrumentedMutexLock l(&log_write_mutex_); - return logs_to_free_.size(); + InstrumentedMutexLock l(&wal_write_mutex_); + return wals_to_free_.size(); } uint64_t DBImpl::TEST_LogfileNumber() { InstrumentedMutexLock l(&mutex_); - return logfile_number_; + return cur_wal_number_; } void DBImpl::TEST_GetAllBlockCaches( @@ -379,10 +380,13 @@ void DBImpl::TEST_VerifyNoObsoleteFilesCached( uint64_t file_number; GetUnaligned(reinterpret_cast(key.data()), &file_number); // Assert file is in live/quarantined set - if (live_and_quar_files.find(file_number) == live_and_quar_files.end()) { + bool cached_file_is_live_or_quar = + live_and_quar_files.find(file_number) != live_and_quar_files.end(); + if (!cached_file_is_live_or_quar) { + // Fail with useful info std::cerr << "File " << file_number << " is not live nor quarantined" << std::endl; - assert(false); + assert(cached_file_is_live_or_quar); } }; table_cache_->ApplyToAllEntries(fn, {}); diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index 49d583e6623d..bb6a9a2e409c 100644 --- a/db/db_impl/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -46,7 +46,8 @@ Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family, // Since we have some more files to compact, we should also recompute // compaction score vstorage->ComputeCompactionScore(cfd->ioptions(), - cfd->GetLatestMutableCFOptions()); + cfd->GetLatestMutableCFOptions(), + cfd->GetFullHistoryTsLow()); EnqueuePendingCompaction(cfd); MaybeScheduleFlushOrCompaction(); } @@ -143,7 +144,8 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, f->compensated_range_deletion_size, f->tail_size, - f->user_defined_timestamps_persisted); + f->user_defined_timestamps_persisted, f->min_timestamp, + f->max_timestamp); } status = versions_->LogAndApply(cfd, read_options, write_options, &edit, diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index c1ef7b96b160..d9d56a1f447b 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -28,7 +28,7 @@ uint64_t DBImpl::MinLogNumberToKeep() { return versions_->min_log_number_to_keep(); } -uint64_t DBImpl::MinLogNumberToRecycle() { return min_log_number_to_recycle_; } +uint64_t DBImpl::MinLogNumberToRecycle() { return min_wal_number_to_recycle_; } uint64_t DBImpl::MinObsoleteSstNumberToKeep() { mutex_.AssertHeld(); @@ -267,82 +267,85 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, if (!job_context->HaveSomethingToDelete()) { mutex_.AssertHeld(); --pending_purge_obsolete_files_; + if (pending_purge_obsolete_files_ == 0) { + bg_cv_.SignalAll(); + } } }); // logs_ is empty when called during recovery, in which case there can't yet // be any tracked obsolete logs - log_write_mutex_.Lock(); + wal_write_mutex_.Lock(); - if (alive_log_files_.empty() || logs_.empty()) { + if (alive_wal_files_.empty() || logs_.empty()) { mutex_.AssertHeld(); // We may reach here if the db is DBImplSecondary - log_write_mutex_.Unlock(); + wal_write_mutex_.Unlock(); return; } bool mutex_unlocked = false; - if (!alive_log_files_.empty() && !logs_.empty()) { + if (!alive_wal_files_.empty() && !logs_.empty()) { uint64_t min_log_number = job_context->log_number; - size_t num_alive_log_files = alive_log_files_.size(); + size_t num_alive_wal_files = alive_wal_files_.size(); // find newly obsoleted log files - while (alive_log_files_.begin()->number < min_log_number) { - auto& earliest = *alive_log_files_.begin(); + while (alive_wal_files_.begin()->number < min_log_number) { + auto& earliest = *alive_wal_files_.begin(); if (immutable_db_options_.recycle_log_file_num > - log_recycle_files_.size() && + wal_recycle_files_.size() && earliest.number >= MinLogNumberToRecycle()) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "adding log %" PRIu64 " to recycle list\n", earliest.number); - log_recycle_files_.push_back(earliest.number); + wal_recycle_files_.push_back(earliest.number); } else { job_context->log_delete_files.push_back(earliest.number); } if (job_context->size_log_to_delete == 0) { - job_context->prev_total_log_size = total_log_size_; - job_context->num_alive_log_files = num_alive_log_files; + job_context->prev_wals_total_size = wals_total_size_.LoadRelaxed(); + job_context->num_alive_wal_files = num_alive_wal_files; } job_context->size_log_to_delete += earliest.size; - total_log_size_ -= earliest.size; - alive_log_files_.pop_front(); + wals_total_size_.FetchSubRelaxed(earliest.size); + alive_wal_files_.pop_front(); // Current log should always stay alive since it can't have // number < MinLogNumber(). - assert(alive_log_files_.size()); + assert(alive_wal_files_.size()); } - log_write_mutex_.Unlock(); + wal_write_mutex_.Unlock(); mutex_.Unlock(); mutex_unlocked = true; TEST_SYNC_POINT_CALLBACK("FindObsoleteFiles::PostMutexUnlock", nullptr); - log_write_mutex_.Lock(); + wal_write_mutex_.Lock(); while (!logs_.empty() && logs_.front().number < min_log_number) { auto& log = logs_.front(); if (log.IsSyncing()) { - log_sync_cv_.Wait(); + wal_sync_cv_.Wait(); // logs_ could have changed while we were waiting. continue; } // This WAL file is not live, so it's OK if we never sync the rest of it. // If it's already closed, then it's been fully synced. If // !background_close_inactive_wals then we need to Close it before - // removing from logs_ but not blocking while holding log_write_mutex_. + // removing from logs_ but not blocking while holding wal_write_mutex_. if (!immutable_db_options_.background_close_inactive_wals && log.writer->file()) { // We are taking ownership of and pinning the front entry, so we can // expect it to be the same after releasing and re-acquiring the lock log.PrepareForSync(); - log_write_mutex_.Unlock(); + wal_write_mutex_.Unlock(); // TODO: maybe check the return value of Close. // TODO: plumb Env::IOActivity, Env::IOPriority auto s = log.writer->file()->Close({}); s.PermitUncheckedError(); - log_write_mutex_.Lock(); + wal_write_mutex_.Lock(); log.writer->PublishIfClosed(); assert(&log == &logs_.front()); log.FinishSync(); - log_sync_cv_.SignalAll(); + wal_sync_cv_.SignalAll(); } - logs_to_free_.push_back(log.ReleaseWriter()); + wals_to_free_.push_back(log.ReleaseWriter()); logs_.pop_front(); } // Current log cannot be obsolete. @@ -350,16 +353,16 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, } // We're just cleaning up for DB::Write(). - assert(job_context->logs_to_free.empty()); - job_context->logs_to_free = logs_to_free_; + assert(job_context->wals_to_free.empty()); + job_context->wals_to_free = wals_to_free_; - logs_to_free_.clear(); - log_write_mutex_.Unlock(); + wals_to_free_.clear(); + wal_write_mutex_.Unlock(); if (mutex_unlocked) { mutex_.Lock(); } - job_context->log_recycle_files.assign(log_recycle_files_.begin(), - log_recycle_files_.end()); + job_context->log_recycle_files.assign(wal_recycle_files_.begin(), + wal_recycle_files_.end()); } // Delete obsolete files and log status and information of file deletion @@ -368,6 +371,7 @@ void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, FileType type, uint64_t number) { TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl::BeforeDeletion", const_cast(&fname)); + IGNORE_STATUS_IF_ERROR(Status::IOError()); Status file_deletion_status; if (type == kTableFile || type == kBlobFile || type == kWalFile) { @@ -423,12 +427,14 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { // FindObsoleteFiles() should've populated this so nonzero assert(state.manifest_file_number != 0); + IGNORE_STATUS_IF_ERROR(Status::IOError()); + // Now, convert lists to unordered sets, WITHOUT mutex held; set is slow. std::unordered_set sst_live_set(state.sst_live.begin(), state.sst_live.end()); std::unordered_set blob_live_set(state.blob_live.begin(), state.blob_live.end()); - std::unordered_set log_recycle_files_set( + std::unordered_set wal_recycle_files_set( state.log_recycle_files.begin(), state.log_recycle_files.end()); std::unordered_set quarantine_files_set( state.files_to_quarantine.begin(), state.files_to_quarantine.end()); @@ -488,13 +494,13 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { std::unique(candidate_files.begin(), candidate_files.end()), candidate_files.end()); - if (state.prev_total_log_size > 0) { + if (state.prev_wals_total_size > 0) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "[JOB %d] Try to delete WAL files size %" PRIu64 ", prev total WAL file size %" PRIu64 ", number of live WAL files %" ROCKSDB_PRIszt ".\n", state.job_id, state.size_log_to_delete, - state.prev_total_log_size, state.num_alive_log_files); + state.prev_wals_total_size, state.num_alive_wal_files); } std::vector old_info_log_files; @@ -529,7 +535,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { optsfile_num2 = std::min(optsfile_num2, state.min_options_file_number); // Close WALs before trying to delete them. - for (const auto w : state.logs_to_free) { + for (const auto w : state.wals_to_free) { // TODO: maybe check the return value of Close. // TODO: plumb Env::IOActivity, Env::IOPriority auto s = w->Close({}); @@ -556,8 +562,8 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { case kWalFile: keep = ((number >= state.log_number) || (number == state.prev_log_number) || - (log_recycle_files_set.find(number) != - log_recycle_files_set.end())); + (wal_recycle_files_set.find(number) != + wal_recycle_files_set.end())); break; case kDescriptorFile: // Keep my manifest file, and any newer incarnations' @@ -611,6 +617,11 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { case kOptionsFile: keep = (number >= optsfile_num2); break; + case kCompactionProgressFile: + // Keep compaction progress files - they are managed + // separately by DBImplSecondary for now + keep = true; + break; case kCurrentFile: case kDBLockFile: case kIdentityFile: diff --git a/db/db_impl/db_impl_follower.cc b/db/db_impl/db_impl_follower.cc index 90c4326ceb15..1262c5bdfdb6 100644 --- a/db/db_impl/db_impl_follower.cc +++ b/db/db_impl/db_impl_follower.cc @@ -70,9 +70,6 @@ Status DBImplFollower::Recover( } return s; } - if (immutable_db_options_.paranoid_checks && s.ok()) { - s = CheckConsistency(); - } if (s.ok()) { default_cf_handle_ = new ColumnFamilyHandleImpl( versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); @@ -296,9 +293,9 @@ Status DB::OpenAsFollower( DBImplFollower* impl = new DBImplFollower(tmp_opts, std::move(new_env), dbname, src_path); impl->versions_.reset(new ReactiveVersionSet( - dbname, &impl->immutable_db_options_, impl->file_options_, - impl->table_cache_.get(), impl->write_buffer_manager_, - &impl->write_controller_, impl->io_tracer_)); + dbname, &impl->immutable_db_options_, impl->mutable_db_options_, + impl->file_options_, impl->table_cache_.get(), + impl->write_buffer_manager_, &impl->write_controller_, impl->io_tracer_)); impl->column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet())); impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath(); diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 577a861dcca6..7b2e949789fc 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -35,8 +35,8 @@ Options SanitizeOptions(const std::string& dbname, const Options& src, auto db_options = SanitizeOptions(dbname, DBOptions(src), read_only, logger_creation_s); ImmutableDBOptions immutable_db_options(db_options); - auto cf_options = - SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src)); + auto cf_options = SanitizeCfOptions(immutable_db_options, read_only, + ColumnFamilyOptions(src)); return Options(db_options, cf_options); } @@ -191,12 +191,6 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src, "wal_compression is disabled since only zstd is supported"); } - if (!result.paranoid_checks) { - result.skip_checking_sst_file_sizes_on_db_open = true; - ROCKS_LOG_INFO(result.info_log, - "file size check will be skipped during open."); - } - return result; } @@ -224,6 +218,12 @@ Status DBImpl::ValidateOptions( if (!s.ok()) { return s; } + if (cfd.name == kDefaultColumnFamilyName) { + if (cfd.options.disallow_memtable_writes) { + return Status::InvalidArgument( + "Default column family cannot use disallow_memtable_writes=true"); + } + } } s = ValidateOptions(db_options); return s; @@ -329,7 +329,7 @@ Status DBImpl::NewDB(std::vector* new_filenames) { } FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; file->SetPreallocationBlockSize( - immutable_db_options_.manifest_preallocation_size); + mutable_db_options_.manifest_preallocation_size); std::unique_ptr file_writer(new WritableFileWriter( std::move(file), manifest, file_options, immutable_db_options_.clock, io_tracer_, nullptr /* stats */, @@ -599,7 +599,7 @@ Status DBImpl::Recover( // allow_ingest_behind does not support Level Compaction, // and per_key_placement can have infinite compaction loop for Level // Compaction. Adjust to_level here just to be safe. - if (cfd->ioptions().allow_ingest_behind || + if (cfd->AllowIngestBehind() || moptions.preclude_last_level_data_seconds > 0) { to_level -= 1; } @@ -657,7 +657,8 @@ Status DBImpl::Recover( f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, f->compensated_range_deletion_size, - f->tail_size, f->user_defined_timestamps_persisted); + f->tail_size, f->user_defined_timestamps_persisted, + f->min_timestamp, f->max_timestamp); ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] Moving #%" PRIu64 " from from_level-%d to from_level-%d %" PRIu64 @@ -688,9 +689,6 @@ Status DBImpl::Recover( s = MaybeUpdateNextFileNumber(recovery_ctx); } - if (immutable_db_options_.paranoid_checks && s.ok()) { - s = CheckConsistency(); - } if (s.ok() && !read_only) { // TODO: share file descriptors (FSDirectory) with SetDirectories above std::map> created_dirs; @@ -1113,7 +1111,7 @@ void DBOpenLogRecordReadReporter::Corruption(size_t bytes, const Status& s, static_cast(bytes), s.ToString().c_str()); if (status != nullptr && status->ok()) { *status = s; - corrupted_log_number_ = log_number; + corrupted_wal_number_ = log_number; } } @@ -1197,6 +1195,13 @@ Status DBImpl::ProcessLogFiles( PredecessorWALInfo predecessor_wal_info; for (auto wal_number : wal_numbers) { + // Detecting early break on the next iteration after `wal_number` has been + // advanced since this `wal_number` doesn't affect follow-up handling after + // breaking out of the for loop. + if (!status.ok()) { + break; + } + SequenceNumber prev_next_sequence = *next_sequence; if (status.ok()) { status = ProcessLogFile( wal_number, min_wal_number, is_retry, read_only, job_id, @@ -1204,6 +1209,10 @@ Status DBImpl::ProcessLogFiles( &stop_replay_by_wal_filter, &corrupted_wal_number, corrupted_wal_found, version_edits, &flushed, predecessor_wal_info); } + if (status.ok()) { + status = CheckSeqnoNotSetBackDuringRecovery(prev_next_sequence, + *next_sequence); + } } if (status.ok()) { @@ -1311,6 +1320,7 @@ Status DBImpl::ProcessLogFile( } // FIXME(hx235): consolidate `process_status` and `status` + SequenceNumber prev_next_sequence = *next_sequence; Status process_status = ProcessLogRecord( record, reader, running_ts_sz, wal_number, fname, read_only, job_id, logFileDropped, &reporter, &record_checksum, &last_seqno_observed, @@ -1319,6 +1329,12 @@ Status DBImpl::ProcessLogFile( if (!process_status.ok()) { return process_status; + } else if (Status seqno_check_status = CheckSeqnoNotSetBackDuringRecovery( + prev_next_sequence, *next_sequence); + !seqno_check_status.ok()) { + // Sequence number being set back indicates a serious software bug, the DB + // should not be opened in this case. + return seqno_check_status; } else if (*stop_replay_for_corruption) { break; } @@ -1740,8 +1756,12 @@ Status DBImpl::MaybeHandleStopReplayForCorruptionForInconsistency( ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Column family inconsistency: SST file contains data" " beyond the point of corruption."); - status = Status::Corruption("SST file is ahead of WALs in CF " + - cfd->GetName()); + status = Status::Corruption( + "Column family inconsistency: SST file contains data" + " beyond the point of corruption in CF " + + cfd->GetName() + + ". WAL recovery stopped at corruption point, but SST files" + " contain newer data."); return status; } } @@ -1857,6 +1877,20 @@ Status DBImpl::MaybeFlushFinalMemtableOrRestoreActiveLogFiles( return status; } +Status DBImpl::CheckSeqnoNotSetBackDuringRecovery( + SequenceNumber prev_next_seqno, SequenceNumber current_next_seqno) { + if (prev_next_seqno == kMaxSequenceNumber || + prev_next_seqno <= current_next_seqno) { + return Status::OK(); + } + std::string msg = + "Sequence number is being set backwards during recovery, this is likely " + "a software bug or a data corruption. Prev next seqno: " + + std::to_string(prev_next_seqno) + + " , current next seqno: " + std::to_string(current_next_seqno); + return Status::Corruption(msg); +} + void DBImpl::FinishLogFilesRecovery(int job_id, const Status& status) { event_logger_.Log() << "job" << job_id << "event" << (status.ok() ? "recovery_finished" : "recovery_failed") @@ -1864,8 +1898,8 @@ void DBImpl::FinishLogFilesRecovery(int job_id, const Status& status) { } Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate, - LogFileNumberSize* log_ptr) { - LogFileNumberSize log(wal_number); + WalFileNumberSize* log_ptr) { + WalFileNumberSize log(wal_number); std::string fname = LogFileName(immutable_db_options_.GetWalDir(), wal_number); Status s; @@ -1908,27 +1942,27 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector& wal_numbers) { assert(immutable_db_options_.avoid_flush_during_recovery); // Mark these as alive so they'll be considered for deletion later by // FindObsoleteFiles() - total_log_size_ = 0; - log_empty_ = false; + wals_total_size_.StoreRelaxed(0); + wal_empty_ = false; uint64_t min_wal_with_unflushed_data = versions_->MinLogNumberWithUnflushedData(); for (auto wal_number : wal_numbers) { if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) { // In non-2pc mode, the WAL files not backing unflushed data are not - // alive, thus should not be added to the alive_log_files_. + // alive, thus should not be added to the alive_wal_files_. continue; } // We preallocate space for wals, but then after a crash and restart, those // preallocated space are not needed anymore. It is likely only the last // log has such preallocated space, so we only truncate for the last log. - LogFileNumberSize log; + WalFileNumberSize log; s = GetLogSizeAndMaybeTruncate( wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log); if (!s.ok()) { break; } - total_log_size_ += log.size; - alive_log_files_.push_back(log); + wals_total_size_.FetchAddRelaxed(log.size); + alive_wal_files_.push_back(log); } return s; } @@ -1962,6 +1996,9 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, const size_t ts_sz = ucmp->timestamp_size(); const bool logical_strip_timestamp = ts_sz > 0 && !cfd->ioptions().persist_user_defined_timestamps; + // Note that here we treat flush as level 0 compaction in internal stats + InternalStats::CompactionStats flush_stats(CompactionReason::kFlush, + 1 /* count */); { ScopedArenaPtr iter( logical_strip_timestamp @@ -1989,8 +2026,9 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, meta.oldest_ancester_time = current_time; meta.epoch_number = cfd->NewEpochNumber(); { - auto write_hint = - cfd->current()->storage_info()->CalculateSSTWriteHint(/*level=*/0); + auto write_hint = cfd->current()->storage_info()->CalculateSSTWriteHint( + /*level=*/0, + immutable_db_options_.calculate_sst_write_lifetime_hint_set); mutex_.Unlock(); SequenceNumber earliest_write_conflict_snapshot; @@ -2033,19 +2071,20 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, kMaxSequenceNumber); Version* version = cfd->current(); version->Ref(); - uint64_t num_input_entries = 0; - s = BuildTable(dbname_, versions_.get(), immutable_db_options_, tboptions, - file_options_for_compaction_, cfd->table_cache(), - iter.get(), std::move(range_del_iters), &meta, - &blob_file_additions, snapshot_seqs, earliest_snapshot, - earliest_write_conflict_snapshot, kMaxSequenceNumber, - snapshot_checker, paranoid_file_checks, - cfd->internal_stats(), &io_s, io_tracer_, - BlobFileCreationReason::kRecovery, - nullptr /* seqno_to_time_mapping */, &event_logger_, - job_id, nullptr /* table_properties */, write_hint, - nullptr /*full_history_ts_low*/, &blob_callback_, version, - &num_input_entries); + TableProperties temp_table_proerties; + s = BuildTable( + dbname_, versions_.get(), immutable_db_options_, tboptions, + file_options_for_compaction_, cfd->table_cache(), iter.get(), + std::move(range_del_iters), &meta, &blob_file_additions, + snapshot_seqs, earliest_snapshot, earliest_write_conflict_snapshot, + kMaxSequenceNumber, snapshot_checker, paranoid_file_checks, + cfd->internal_stats(), &io_s, io_tracer_, + BlobFileCreationReason::kRecovery, + nullptr /* seqno_to_time_mapping */, &event_logger_, job_id, + &temp_table_proerties /* table_properties */, write_hint, + nullptr /*full_history_ts_low*/, &blob_callback_, version, + nullptr /* memtable_payload_bytes */, + nullptr /* memtable_garbage_bytes */, &flush_stats); version->Unref(); LogFlush(immutable_db_options_.info_log); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, @@ -2061,10 +2100,31 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, } uint64_t total_num_entries = mem->NumEntries(); - if (s.ok() && total_num_entries != num_input_entries) { + if (s.ok() && total_num_entries != flush_stats.num_input_records) { std::string msg = "Expected " + std::to_string(total_num_entries) + " entries in memtable, but read " + - std::to_string(num_input_entries); + std::to_string(flush_stats.num_input_records); + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "[%s] [JOB %d] Level-0 flush during recover: %s", + cfd->GetName().c_str(), job_id, msg.c_str()); + if (immutable_db_options_.flush_verify_memtable_count) { + s = Status::Corruption(msg); + } + } + // Only verify on table with format collects table properties + const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions(); + if (s.ok() && + (mutable_cf_options.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName()) || + mutable_cf_options.table_factory->IsInstanceOf( + TableFactory::kPlainTableName())) && + flush_stats.num_output_records != temp_table_proerties.num_entries) { + std::string msg = + "Number of keys in flush output SST files does not match " + "number of keys added to the table. Expected " + + std::to_string(flush_stats.num_output_records) + " but there are " + + std::to_string(temp_table_proerties.num_entries) + + " in output SST files"; ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] [JOB %d] Level-0 flush during recover: %s", cfd->GetName().c_str(), job_id, msg.c_str()); @@ -2112,25 +2172,25 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, } } - InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); - stats.micros = immutable_db_options_.clock->NowMicros() - start_micros; + flush_stats.micros = immutable_db_options_.clock->NowMicros() - start_micros; if (has_output) { - stats.bytes_written = meta.fd.GetFileSize(); - stats.num_output_files = 1; + flush_stats.bytes_written = meta.fd.GetFileSize(); + flush_stats.num_output_files = 1; } const auto& blobs = edit->GetBlobFileAdditions(); for (const auto& blob : blobs) { - stats.bytes_written_blob += blob.GetTotalBlobBytes(); + flush_stats.bytes_written_blob += blob.GetTotalBlobBytes(); } - stats.num_output_files_blob = static_cast(blobs.size()); + flush_stats.num_output_files_blob = static_cast(blobs.size()); - cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats); + cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, + flush_stats); cfd->internal_stats()->AddCFStats( InternalStats::BYTES_FLUSHED, - stats.bytes_written + stats.bytes_written_blob); + flush_stats.bytes_written + flush_stats.bytes_written_blob); RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); return s; } @@ -2204,7 +2264,7 @@ Status DB::OpenAndTrimHistory( return s; } - DB* db = nullptr; + std::unique_ptr db; s = DB::Open(db_options, dbname, column_families, handles, &db); if (!s.ok()) { return s; @@ -2213,7 +2273,7 @@ Status DB::OpenAndTrimHistory( CompactRangeOptions options; options.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - auto db_impl = static_cast_with_check(db); + auto db_impl = static_cast_with_check(db.get()); for (auto handle : *handles) { assert(handle != nullptr); auto cfh = static_cast_with_check(handle); @@ -2235,14 +2295,14 @@ Status DB::OpenAndTrimHistory( assert(temp_s.ok()); } handles->clear(); - delete db; + db.reset(); }; if (!s.ok()) { clean_op(); return s; } - dbptr->reset(db); + *dbptr = std::move(db); return s; } @@ -2258,6 +2318,7 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options, BuildDBOptions(immutable_db_options_, mutable_db_options_); FileOptions opt_file_options = fs_->OptimizeForLogWrite(file_options_, db_options); + opt_file_options.write_hint = CalculateWALWriteHint(); // DB option takes precedence when not kUnknown if (immutable_db_options_.wal_write_temperature != Temperature::kUnknown) { opt_file_options.temperature = immutable_db_options_.wal_write_temperature; @@ -2279,7 +2340,9 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options, } if (io_s.ok()) { - lfile->SetWriteLifeTimeHint(CalculateWALWriteHint()); + // Subsequent attempts to override the hint via SetWriteLifeTimeHint + // with the very same value will be ignored by the fs. + lfile->SetWriteLifeTimeHint(opt_file_options.write_hint); lfile->SetPreallocationBlockSize(preallocate_block_size); const auto& listeners = immutable_db_options_.listeners; @@ -2334,9 +2397,11 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, handles->clear(); size_t max_write_buffer_size = 0; + MinAndMaxPreserveSeconds preserve_info; for (const auto& cf : column_families) { max_write_buffer_size = std::max(max_write_buffer_size, cf.options.write_buffer_size); + preserve_info.Combine(cf.options); } auto impl = std::make_unique(db_options, dbname, seq_per_batch, @@ -2405,18 +2470,18 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, if (s.ok()) { // Prevent log files created by previous instance from being recycled. // They might be in alive_log_file_, and might get recycled otherwise. - impl->min_log_number_to_recycle_ = new_log_number; + impl->min_wal_number_to_recycle_ = new_log_number; } if (s.ok()) { - InstrumentedMutexLock wl(&impl->log_write_mutex_); - impl->logfile_number_ = new_log_number; + InstrumentedMutexLock wl(&impl->wal_write_mutex_); + impl->cur_wal_number_ = new_log_number; assert(new_log != nullptr); assert(impl->logs_.empty()); impl->logs_.emplace_back(new_log_number, new_log); } if (s.ok()) { - impl->alive_log_files_.emplace_back(impl->logfile_number_); + impl->alive_wal_files_.emplace_back(impl->cur_wal_number_); // In WritePrepared there could be gap in sequence numbers. This breaks // the trick we use in kPointInTimeRecovery which assumes the first seq in // the log right after the corrupted log is one larger than the last seq @@ -2429,14 +2494,14 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, if (recovered_seq != kMaxSequenceNumber) { WriteBatch empty_batch; WriteBatchInternal::SetSequence(&empty_batch, recovered_seq); - uint64_t log_used, log_size; + uint64_t wal_used, log_size; log::Writer* log_writer = impl->logs_.back().writer; - LogFileNumberSize& log_file_number_size = impl->alive_log_files_.back(); + WalFileNumberSize& wal_file_number_size = impl->alive_wal_files_.back(); - assert(log_writer->get_log_number() == log_file_number_size.number); + assert(log_writer->get_log_number() == wal_file_number_size.number); impl->mutex_.AssertHeld(); - s = impl->WriteToWAL(empty_batch, write_options, log_writer, &log_used, - &log_size, log_file_number_size, recovered_seq); + s = impl->WriteToWAL(empty_batch, write_options, log_writer, &wal_used, + &log_size, wal_file_number_size, recovered_seq); if (s.ok()) { // Need to fsync, otherwise it might get lost after a power reset. s = impl->FlushWAL(write_options, false); @@ -2469,6 +2534,12 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, s = impl->InitPersistStatsColumnFamily(); } + // After reaching the post-recovery seqno but before creating SuperVersions + // ensure seqno to time mapping is pre-populated as needed. + if (s.ok() && recovery_ctx.is_new_db_ && preserve_info.IsEnabled()) { + impl->PrepopulateSeqnoToTimeMapping(preserve_info); + } + if (s.ok()) { // set column family handles for (const auto& cf : column_families) { @@ -2478,6 +2549,9 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, handles->push_back( new ColumnFamilyHandleImpl(cfd, impl.get(), &impl->mutex_)); impl->NewThreadStatusCfInfo(cfd); + SuperVersionContext sv_context(/* create_superversion */ true); + impl->InstallSuperVersionForConfigChange(cfd, &sv_context); + sv_context.Clean(); } else { if (db_options.create_missing_column_families) { // missing column family, create it @@ -2485,6 +2559,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, impl->mutex_.Unlock(); // NOTE: the work normally done in WrapUpCreateColumnFamilies will // be done separately below. + // This includes InstallSuperVersionForConfigChange. s = impl->CreateColumnFamilyImpl(read_options, write_options, cf.options, cf.name, &handle); impl->mutex_.Lock(); @@ -2501,15 +2576,14 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, } } - if (s.ok()) { + if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { + // Install SuperVersion for hidden column family + assert(impl->persist_stats_cf_handle_); + assert(impl->persist_stats_cf_handle_->cfd()); SuperVersionContext sv_context(/* create_superversion */ true); - for (auto cfd : *impl->versions_->GetColumnFamilySet()) { - impl->InstallSuperVersionAndScheduleWork(cfd, &sv_context); - } + impl->InstallSuperVersionForConfigChange( + impl->persist_stats_cf_handle_->cfd(), &sv_context); sv_context.Clean(); - } - - if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { // try to read format version s = impl->PersistentStatsProcessFormatVersion(); } @@ -2618,8 +2692,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, s = impl->StartPeriodicTaskScheduler(); } if (s.ok()) { - s = impl->RegisterRecordSeqnoTimeWorker(read_options, write_options, - recovery_ctx.is_new_db_); + s = impl->RegisterRecordSeqnoTimeWorker(); } impl->options_mutex_.Unlock(); if (s.ok()) { diff --git a/db/db_impl/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc index dac0d9660037..31934ee192c7 100644 --- a/db/db_impl/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -185,16 +185,10 @@ Iterator* DBImplReadOnly::NewIterator(const ReadOptions& _read_options, ? static_cast(read_options.snapshot)->number_ : latest_snapshot; ReadCallback* read_callback = nullptr; // No read callback provided. - auto db_iter = NewArenaWrappedDbIterator( - env_, read_options, cfd->ioptions(), super_version->mutable_cf_options, - super_version->current, read_seq, - super_version->mutable_cf_options.max_sequential_skip_in_iterations, - super_version->version_number, read_callback); - auto internal_iter = NewInternalIterator( - db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(), - read_seq, /* allow_unprepared_value */ true, db_iter); - db_iter->SetIterUnderDBIter(internal_iter); - return db_iter; + return NewArenaWrappedDbIterator( + env_, read_options, cfh, super_version, read_seq, read_callback, this, + /*expose_blob_index=*/false, /*allow_refresh=*/false, + /*allow_mark_memtable_for_flush=*/false); } Status DBImplReadOnly::NewIterators( @@ -231,36 +225,32 @@ Status DBImplReadOnly::NewIterators( ? static_cast(read_options.snapshot)->number_ : latest_snapshot; - autovector> cfd_to_sv; + autovector> cfh_to_sv; const bool check_read_ts = read_options.timestamp && read_options.timestamp->size() > 0; for (auto cfh : column_families) { auto* cfd = static_cast_with_check(cfh)->cfd(); auto* sv = cfd->GetSuperVersion()->Ref(); - cfd_to_sv.emplace_back(cfd, sv); + cfh_to_sv.emplace_back(static_cast_with_check(cfh), + sv); if (check_read_ts) { const Status s = FailIfReadCollapsedHistory(cfd, sv, *(read_options.timestamp)); if (!s.ok()) { - for (auto prev_entry : cfd_to_sv) { + for (auto prev_entry : cfh_to_sv) { std::get<1>(prev_entry)->Unref(); } return s; } } } - assert(cfd_to_sv.size() == column_families.size()); - for (auto [cfd, sv] : cfd_to_sv) { + assert(cfh_to_sv.size() == column_families.size()); + for (auto [cfh, sv] : cfh_to_sv) { auto* db_iter = NewArenaWrappedDbIterator( - env_, read_options, cfd->ioptions(), sv->mutable_cf_options, - sv->current, read_seq, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number, read_callback); - auto* internal_iter = NewInternalIterator( - db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), read_seq, - /* allow_unprepared_value */ true, db_iter); - db_iter->SetIterUnderDBIter(internal_iter); + env_, read_options, cfh, sv, read_seq, read_callback, this, + /*expose_blob_index=*/false, /*allow_refresh=*/false, + /*allow_mark_memtable_for_flush=*/false); iterators->push_back(db_iter); } diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h index 9566f547bfeb..2f456561cc30 100644 --- a/db/db_impl/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -121,6 +121,11 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } + using DBImpl::FlushWAL; + Status FlushWAL(const FlushWALOptions& /*options*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DB::IngestExternalFile; Status IngestExternalFile( ColumnFamilyHandle* /*column_family*/, @@ -155,6 +160,29 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } + using DB::CreateColumnFamily; + using DBImpl::CreateColumnFamily; + Status CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/, + const std::string& /*column_family*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DB::CreateColumnFamilies; + using DBImpl::CreateColumnFamilies; + Status CreateColumnFamilies( + const ColumnFamilyOptions& /*cf_options*/, + const std::vector& /*column_family_names*/, + std::vector* /*handles*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + Status CreateColumnFamilies( + const std::vector& /*column_families*/, + std::vector* /*handles*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + // FIXME: some missing overrides for more "write" functions protected: diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index a9082db3b42f..0db4820c3925 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -8,7 +8,12 @@ #include #include "db/arena_wrapped_db_iter.h" +#include "db/log_reader.h" +#include "db/log_writer.h" #include "db/merge_context.h" +#include "db/version_edit.h" +#include "file/filename.h" +#include "file/writable_file_writer.h" #include "logging/auto_roll_logger.h" #include "logging/logging.h" #include "monitoring/perf_context_imp.h" @@ -49,9 +54,6 @@ Status DBImplSecondary::Recover( } return s; } - if (immutable_db_options_.paranoid_checks && s.ok()) { - s = CheckConsistency(); - } // Initial max_total_in_memory_state_ before recovery logs. max_total_in_memory_state_ = 0; for (auto cfd : *versions_->GetColumnFamilySet()) { @@ -507,10 +509,6 @@ Iterator* DBImplSecondary::NewIterator(const ReadOptions& _read_options, if (read_options.io_activity == Env::IOActivity::kUnknown) { read_options.io_activity = Env::IOActivity::kDBIterator; } - if (read_options.managed) { - return NewErrorIterator( - Status::NotSupported("Managed iterator is not supported anymore.")); - } if (read_options.read_tier == kPersistedTier) { return NewErrorIterator(Status::NotSupported( "ReadTier::kPersistedData is not yet supported in iterators.")); @@ -566,17 +564,10 @@ ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl( assert(snapshot == kMaxSequenceNumber); snapshot = versions_->LastSequence(); assert(snapshot != kMaxSequenceNumber); - auto db_iter = NewArenaWrappedDbIterator( - env_, read_options, cfh->cfd()->ioptions(), - super_version->mutable_cf_options, super_version->current, snapshot, - super_version->mutable_cf_options.max_sequential_skip_in_iterations, - super_version->version_number, read_callback, cfh, expose_blob_index, - allow_refresh); - auto internal_iter = NewInternalIterator( - db_iter->GetReadOptions(), cfh->cfd(), super_version, db_iter->GetArena(), - snapshot, /* allow_unprepared_value */ true, db_iter); - db_iter->SetIterUnderDBIter(internal_iter); - return db_iter; + return NewArenaWrappedDbIterator(env_, read_options, cfh, super_version, + snapshot, read_callback, this, + expose_blob_index, allow_refresh, + /*allow_mark_memtable_for_flush=*/false); } Status DBImplSecondary::NewIterators( @@ -593,9 +584,6 @@ Status DBImplSecondary::NewIterators( if (read_options.io_activity == Env::IOActivity::kUnknown) { read_options.io_activity = Env::IOActivity::kDBIterator; } - if (read_options.managed) { - return Status::NotSupported("Managed iterator is not supported anymore."); - } if (read_options.read_tier == kPersistedTier) { return Status::NotSupported( "ReadTier::kPersistedData is not yet supported in iterators."); @@ -660,58 +648,15 @@ Status DBImplSecondary::NewIterators( return Status::OK(); } -Status DBImplSecondary::CheckConsistency() { - mutex_.AssertHeld(); - Status s = DBImpl::CheckConsistency(); - // If DBImpl::CheckConsistency() which is stricter returns success, then we - // do not need to give a second chance. - if (s.ok()) { - return s; - } - // It's possible that DBImpl::CheckConssitency() can fail because the primary - // may have removed certain files, causing the GetFileSize(name) call to - // fail and returning a PathNotFound. In this case, we take a best-effort - // approach and just proceed. - TEST_SYNC_POINT_CALLBACK( - "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s); - - if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) { - return Status::OK(); - } - - std::vector metadata; - versions_->GetLiveFilesMetaData(&metadata); - - std::string corruption_messages; - for (const auto& md : metadata) { - // md.name has a leading "/". - std::string file_path = md.db_path + md.name; - - uint64_t fsize = 0; - s = env_->GetFileSize(file_path, &fsize); - if (!s.ok() && - (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() || - s.IsPathNotFound())) { - s = Status::OK(); - } - if (!s.ok()) { - corruption_messages += - "Can't access " + md.name + ": " + s.ToString() + "\n"; - } - } - return corruption_messages.empty() ? Status::OK() - : Status::Corruption(corruption_messages); -} - Status DBImplSecondary::TryCatchUpWithPrimary() { assert(versions_.get() != nullptr); - assert(manifest_reader_.get() != nullptr); Status s; // read the manifest and apply new changes to the secondary instance std::unordered_set cfds_changed; JobContext job_context(0, true /*create_superversion*/); { InstrumentedMutexLock lock_guard(&mutex_); + assert(manifest_reader_.get() != nullptr); s = static_cast_with_check(versions_.get()) ->ReadAndApply(&mutex_, &manifest_reader_, manifest_reader_status_.get(), &cfds_changed, @@ -735,13 +680,13 @@ Status DBImplSecondary::TryCatchUpWithPrimary() { // instance if (s.ok()) { s = FindAndRecoverLogFiles(&cfds_changed, &job_context); - } - if (s.IsPathNotFound()) { - ROCKS_LOG_INFO( - immutable_db_options_.info_log, - "Secondary tries to read WAL, but WAL file(s) have already " - "been purged by primary."); - s = Status::OK(); + if (s.IsPathNotFound()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Secondary tries to read WAL, but WAL file(s) have already " + "been purged by primary."); + s = Status::OK(); + } } if (s.ok()) { for (auto cfd : cfds_changed) { @@ -831,9 +776,9 @@ Status DB::OpenAsSecondary( handles->clear(); DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname, secondary_path); impl->versions_.reset(new ReactiveVersionSet( - dbname, &impl->immutable_db_options_, impl->file_options_, - impl->table_cache_.get(), impl->write_buffer_manager_, - &impl->write_controller_, impl->io_tracer_)); + dbname, &impl->immutable_db_options_, impl->mutable_db_options_, + impl->file_options_, impl->table_cache_.get(), + impl->write_buffer_manager_, &impl->write_controller_, impl->io_tracer_)); impl->column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet())); impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath(); @@ -876,18 +821,517 @@ Status DB::OpenAsSecondary( return s; } +Status DBImplSecondary::ScanCompactionProgressFiles( + CompactionProgressFilesScan* scan_result) { + assert(scan_result != nullptr); + scan_result->Clear(); + + WriteOptions write_options(Env::IOActivity::kCompaction); + IOOptions opts; + Status s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (!s.ok()) { + return s; + } + + std::vector all_filenames; + s = fs_->GetChildren(secondary_path_, opts, &all_filenames, nullptr /* dbg*/); + if (!s.ok()) { + return s; + } + + for (const auto& filename : all_filenames) { + if (filename == "." || filename == "..") { + continue; + } + + uint64_t number; + FileType type; + + if (!ParseFileName(filename, &number, &type)) { + continue; + } + + // Categorize compaction progress files + if (type == kCompactionProgressFile) { + if (number > scan_result->latest_progress_timestamp) { + // Found a newer progress file + if (scan_result->HasLatestProgressFile()) { + // Previous "latest" becomes "old" + scan_result->old_progress_filenames.push_back( + scan_result->latest_progress_filename.value()); + } + scan_result->latest_progress_timestamp = number; + scan_result->latest_progress_filename = filename; + } else { + // This is an older progress file + scan_result->old_progress_filenames.push_back(filename); + } + } else if (type == kTempFile && + filename.find(kCompactionProgressFileNamePrefix) == 0) { + // Temporary progress files + scan_result->temp_progress_filenames.push_back(filename); + } else if (type == kTableFile) { + // Collect table file numbers for CleanupPhysicalCompactionOutputFiles + scan_result->table_file_numbers.push_back(number); + } + } + + return Status::OK(); +} + +Status DBImplSecondary::DeleteCompactionProgressFiles( + const std::vector& filenames) { + WriteOptions write_options(Env::IOActivity::kCompaction); + IOOptions opts; + Status s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (!s.ok()) { + return s; + } + + for (const auto& filename : filenames) { + std::string file_path = secondary_path_ + "/" + filename; + Status delete_status = fs_->DeleteFile(file_path, opts, nullptr /* dbg */); + if (!delete_status.ok()) { + return delete_status; + } + } + + return Status::OK(); +} + +Status DBImplSecondary::CleanupOldAndTemporaryCompactionProgressFiles( + bool preserve_latest, const CompactionProgressFilesScan& scan_result) { + std::vector filenames_to_delete; + + // Always delete old progress files + filenames_to_delete.insert(filenames_to_delete.end(), + scan_result.old_progress_filenames.begin(), + scan_result.old_progress_filenames.end()); + + // Always delete temp files + filenames_to_delete.insert(filenames_to_delete.end(), + scan_result.temp_progress_filenames.begin(), + scan_result.temp_progress_filenames.end()); + + // Conditionally delete latest file + if (!preserve_latest && scan_result.HasLatestProgressFile()) { + filenames_to_delete.push_back(scan_result.latest_progress_filename.value()); + } + + return DeleteCompactionProgressFiles(filenames_to_delete); +} + +// Loads compaction progress from a file and cleans up extra output +// files. After loading the progress, this function identifies and deletes any +// SST files in the output folder that are NOT tracked in the +// progress. This ensures consistency between the progress file and +// actual output files on disk. +Status DBImplSecondary::LoadCompactionProgressAndCleanupExtraOutputFiles( + const std::string& compaction_progress_file_path, + const CompactionProgressFilesScan& scan_result) { + Status s = ParseCompactionProgressFile(compaction_progress_file_path, + &compaction_progress_); + if (s.ok()) { + s = CleanupPhysicalCompactionOutputFiles(true /* preserve_tracked_files */, + scan_result); + } + return s; +} + +Status DBImplSecondary::ParseCompactionProgressFile( + const std::string& compaction_progress_file_path, + CompactionProgress* compaction_progress) { + std::unique_ptr file; + Status s = fs_->NewSequentialFile(compaction_progress_file_path, + FileOptions(), &file, nullptr /* dbg */); + if (!s.ok()) { + return s; + } + + std::unique_ptr file_reader(new SequentialFileReader( + std::move(file), compaction_progress_file_path, + immutable_db_options_.log_readahead_size, io_tracer_, {} /* listeners */, + immutable_db_options_.rate_limiter.get())); + + Status reader_status; + + struct CompactionProgressReaderReporter : public log::Reader::Reporter { + Status* status; + explicit CompactionProgressReaderReporter(Status* s) : status(s) {} + + void Corruption(size_t /*bytes*/, const Status& s, + uint64_t /*log_number*/) override { + if (status->ok()) { + *status = s; + } + } + + void OldLogRecord(size_t /*bytes*/) override { + // Ignore old records + } + } progress_reporter(&reader_status); + + log::Reader compaction_progress_reader( + immutable_db_options_.info_log, std::move(file_reader), + &progress_reporter, true /* checksum */, 0 /* log_num */); + + // LIMITATION: Only supports resuming single subcompaction + SubcompactionProgressBuilder progress_builder; + Slice slice; + std::string record; + + while (compaction_progress_reader.ReadRecord(&slice, &record) && + reader_status.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(slice); + if (!s.ok()) { + break; + } + + bool res = progress_builder.ProcessVersionEdit(edit); + if (!res) { + break; + } + } + + if (!reader_status.ok()) { + return reader_status; + } + + if (!s.ok()) { + return s; + } + + if (progress_builder.HasAccumulatedSubcompactionProgress()) { + compaction_progress->clear(); + compaction_progress->push_back( + progress_builder.GetAccumulatedSubcompactionProgress()); + } else { + s = Status::NotFound("No compaction progress was persisted yet"); + } + + return s; +} + +Status DBImplSecondary::RenameCompactionProgressFile( + const std::string& temp_file_path, std::string* final_file_path) { + uint64_t current_time = env_->NowMicros(); + *final_file_path = CompactionProgressFileName(secondary_path_, current_time); + + WriteOptions write_options(Env::IOActivity::kCompaction); + IOOptions opts; + Status s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (!s.ok()) { + return s; + } + + s = fs_->RenameFile(temp_file_path, *final_file_path, opts, + nullptr /* dbg */); + + return s; +} + +Status DBImplSecondary::CleanupPhysicalCompactionOutputFiles( + bool preserve_tracked_files, + const CompactionProgressFilesScan& scan_result) { + std::unordered_set files_to_preserve; + + if (preserve_tracked_files) { + for (const auto& subcompaction_progress : compaction_progress_) { + for (const auto& file_metadata : + subcompaction_progress.output_level_progress.GetOutputFiles()) { + files_to_preserve.insert(file_metadata.fd.GetNumber()); + } + for (const auto& file_metadata : + subcompaction_progress.proximal_output_level_progress + .GetOutputFiles()) { + files_to_preserve.insert(file_metadata.fd.GetNumber()); + } + } + } + + WriteOptions write_options(Env::IOActivity::kCompaction); + IOOptions opts; + Status s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (!s.ok()) { + return s; + } + + for (uint64_t file_number : scan_result.table_file_numbers) { + bool should_delete = + !preserve_tracked_files || + (files_to_preserve.find(file_number) == files_to_preserve.end()); + + if (should_delete) { + std::string file_path = MakeTableFileName(secondary_path_, file_number); + Status delete_status = + fs_->DeleteFile(file_path, opts, nullptr /* dbg */); + if (!delete_status.ok()) { + return delete_status; + } + } + } + + return Status::OK(); +} + +Status DBImplSecondary::InitializeCompactionWorkspace( + bool allow_resumption, std::unique_ptr* output_dir, + std::unique_ptr* compaction_progress_writer) { + // Create output directory if it doest exist yet + Status s = CreateAndNewDirectory(fs_.get(), secondary_path_, output_dir); + if (!s.ok() || !allow_resumption) { + return s; + } + + s = PrepareCompactionProgressState(); + + if (!s.ok()) { + return s; + } + + s = FinalizeCompactionProgressWriter(compaction_progress_writer); + + if (!s.ok()) { + return s; + } + + return Status::OK(); +} + +// PrepareCompactionProgressState() manages compaction progress files and output +// files to ensure a clean, consistent state for resuming or starting fresh +// compaction. +// +// PRECONDITION: +// - This function is ONLY called when allow_resumption = true +// - The caller wants resumption support for this compaction attempt +// +// FILE SYSTEM STATE (before entering this function): +// - 0 or more compaction progress files may exist in `secondary_path_`: +// * Latest progress file (from the most recent compaction attempt) +// * Older progress files (left by crashing during a previous +// InitializeCompactionWorkspace() call) +// * Temporary progress files (left by crashing during a previous +// InitializeCompactionWorkspace() call) +// - 0 or more compaction output files may exist in `secondary_path_` +// +// POSTCONDITIONS (after this function): +// - IF the latest progress file exists AND it parses successfully AND +// actually contains valid compaction progress: +// * Exactly one latest progress file remains +// * All older and temporary compaction progress files are deleted +// * All corresponding compaction output files are preserved +// * All extra compaction output files are deleted (files left by +// compaction +// crashing before persisting the progress) +// * Result: Ready to resume compaction from the saved progress +// - OTHERWISE (no latest progress file OR it fails to parse OR it's +// invalid): +// * ALL compaction progress files are deleted (latest + older + +// temporary) +// * ALL compaction output files are deleted +// * Result: Ready to start fresh compaction (despite allow_resumption = +// true, we cannot resume because there's no valid progress to resume from) +// +// ERROR HANDLING: +// - ON ERROR (if any of the postconditions cannot be achieved): +// * Function returns error status +// * File system may be left in a partially modified state +// * Caller should manually clean up secondary_path_ before retrying +// * Subsequent OpenAndCompact() calls to this clean secondary_path_ will +// effectively start fresh compaction +Status DBImplSecondary::PrepareCompactionProgressState() { + Status s; + + // STEP 1: Scan directory ONCE (includes progress files + table files) + CompactionProgressFilesScan scan_result; + s = ScanCompactionProgressFiles(&scan_result); + if (!s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Encountered error when scanning for compaction " + "progress files: %s", + s.ToString().c_str()); + return s; + } + + std::optional latest_progress_file = + scan_result.latest_progress_filename; + + // STEP 2: Determine if we should resume + bool should_resume = false; + if (latest_progress_file.has_value()) { + should_resume = true; + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Did not find any latest compaction progress file. " + "Will perform clean up to start fresh compaction"); + } + + // STEP 3: Cleanup using pre-scanned results + if (should_resume) { + // Keep latest, delete old/temp + s = CleanupOldAndTemporaryCompactionProgressFiles( + true /* preserve_latest */, scan_result); + } else { + // Delete everything including latest + s = CleanupOldAndTemporaryCompactionProgressFiles( + false /* preserve_latest */, scan_result); + latest_progress_file.reset(); + } + + if (!s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Failed to clean up compaction progress file(s): %s. " + "Will fail the compaction", + s.ToString().c_str()); + return s; + } + + // STEP 4: Load progress if resuming + if (latest_progress_file.has_value()) { + uint64_t timestamp = scan_result.latest_progress_timestamp; + + std::string compaction_progress_file_path = + CompactionProgressFileName(secondary_path_, timestamp); + + s = LoadCompactionProgressAndCleanupExtraOutputFiles( + compaction_progress_file_path, scan_result); + + if (!s.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to load the latest compaction " + "progress from %s: %s. Will perform clean up " + "to start fresh compaction", + latest_progress_file.value().c_str(), + s.ToString().c_str()); + return HandleInvalidOrNoCompactionProgress(compaction_progress_file_path, + scan_result); + } + + ROCKS_LOG_DEBUG( + immutable_db_options_.info_log, + "Loaded compaction progress with %zu subcompaction(s) from %s", + compaction_progress_.size(), compaction_progress_file_path.c_str()); + return s; + } else { + return HandleInvalidOrNoCompactionProgress( + std::nullopt /* compaction_progress_file_path */, scan_result); + } +} + +uint64_t DBImplSecondary::CalculateResumedCompactionBytes( + const CompactionProgress& compaction_progress) const { + uint64_t total_resumed_bytes = 0; + + for (const auto& subcompaction_progress : compaction_progress) { + for (const auto& file_meta : + subcompaction_progress.output_level_progress.GetOutputFiles()) { + total_resumed_bytes += file_meta.fd.file_size; + } + + for (const auto& file_meta : + subcompaction_progress.proximal_output_level_progress + .GetOutputFiles()) { + total_resumed_bytes += file_meta.fd.file_size; + } + } + + return total_resumed_bytes; +} + +Status DBImplSecondary::HandleInvalidOrNoCompactionProgress( + const std::optional& compaction_progress_file_path, + const CompactionProgressFilesScan& scan_result) { + compaction_progress_.clear(); + + Status s; + if (compaction_progress_file_path.has_value()) { + WriteOptions write_options(Env::IOActivity::kCompaction); + IOOptions opts; + s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok()) { + s = fs_->DeleteFile(compaction_progress_file_path.value(), opts, + nullptr /* dbg */); + } + if (!s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Failed to remove invalid progress file: %s", + s.ToString().c_str()); + return s; + } + } + + s = CleanupPhysicalCompactionOutputFiles(false /* preserve_tracked_files */, + scan_result); + if (!s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Failed to cleanup existing compaction output files: %s", + s.ToString().c_str()); + return s; + } + + return Status::OK(); +} + Status DBImplSecondary::CompactWithoutInstallation( const OpenAndCompactOptions& options, ColumnFamilyHandle* cfh, const CompactionServiceInput& input, CompactionServiceResult* result) { if (options.canceled && options.canceled->load(std::memory_order_acquire)) { return Status::Incomplete(Status::SubCode::kManualCompactionPaused); } + + std::unique_ptr output_dir; + std::unique_ptr compaction_progress_writer; + InstrumentedMutexLock l(&mutex_); + auto cfd = static_cast_with_check(cfh)->cfd(); if (!cfd) { return Status::InvalidArgument("Cannot find column family" + cfh->GetName()); } + Status s; + + const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions(); + + // TODO(hx235): Resuming compaction is currently incompatible with + // output hash verification (enabled via paranoid_file_checks=true or + // verify_output_flags containing kVerifyIteration) because resumed compaction + // will lose the hash computed before interruption. + // Potential solutions: + // 1. Persist the hash state: Before interruption, save the current hash value + // of each output file to disk, allowing validation to continue correctly + // after resumption. + // 2. Immediate verification: Move output verification to happen + // immediately after each output file is created and closed, eliminating + // the need to maintain hash state across resumption boundaries. + bool output_hash_verification_enabled = + mutable_cf_options.paranoid_file_checks || + !!(mutable_cf_options.verify_output_flags & + VerifyOutputFlags::kVerifyIteration); + + bool allow_resumption = + options.allow_resumption && !output_hash_verification_enabled; + + if (options.allow_resumption && output_hash_verification_enabled) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Resume compaction configured but disabled due to " + "incompatibility with output hash verification " + "(paranoid_file_checks=true or verify_output_flags " + "containing kVerifyIteration)"); + } + + mutex_.Unlock(); + + s = InitializeCompactionWorkspace(allow_resumption, &output_dir, + &compaction_progress_writer); + + mutex_.Lock(); + + if (!s.ok()) { + return s; + } std::unordered_set input_set; for (const auto& file_name : input.input_files) { @@ -901,46 +1345,56 @@ Status DBImplSecondary::CompactWithoutInstallation( VersionStorageInfo* vstorage = version->storage_info(); - // Use comp_options to reuse some CompactFiles functions CompactionOptions comp_options; comp_options.compression = kDisableCompressionOption; comp_options.output_file_size_limit = MaxFileSizeForLevel( - cfd->GetLatestMutableCFOptions(), input.output_level, - cfd->ioptions().compaction_style, vstorage->base_level(), + mutable_cf_options, input.output_level, cfd->ioptions().compaction_style, + vstorage->base_level(), cfd->ioptions().level_compaction_dynamic_level_bytes); std::vector input_files; - Status s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers( + s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers( &input_files, &input_set, vstorage, comp_options); if (!s.ok()) { ROCKS_LOG_ERROR( immutable_db_options_.info_log, "GetCompactionInputsFromFileNumbers() failed - %s.\n DebugString: %s", - s.ToString().c_str(), version->DebugString().c_str()); + s.ToString().c_str(), version->DebugString(/*hex=*/true).c_str()); return s; } + const int job_id = next_job_id_.fetch_add(1); + JobContext job_context(job_id, true /*create_superversion*/); + std::vector snapshots = input.snapshots; + + // TODO - snapshot_checker support in Remote Compaction + job_context.InitSnapshotContext(/*checker=*/nullptr, + /*managed_snapshot=*/nullptr, + kMaxSequenceNumber, std::move(snapshots)); + + // TODO - consider serializing the entire Compaction object and using it as + // input instead of recreating it in the remote worker std::unique_ptr c; assert(cfd->compaction_picker()); - c.reset(cfd->compaction_picker()->CompactFiles( + std::optional earliest_snapshot = std::nullopt; + // Standalone Range Deletion Optimization is only supported in Universal + // Compactions - https://github.com/facebook/rocksdb/pull/13078 + if (cfd->GetLatestCFOptions().compaction_style == + CompactionStyle::kCompactionStyleUniversal) { + earliest_snapshot = !job_context.snapshot_seqs.empty() + ? job_context.snapshot_seqs.front() + : kMaxSequenceNumber; + } + c.reset(cfd->compaction_picker()->PickCompactionForCompactFiles( comp_options, input_files, input.output_level, vstorage, - cfd->GetLatestMutableCFOptions(), mutable_db_options_, 0)); + mutable_cf_options, mutable_db_options_, 0, earliest_snapshot, + job_context.snapshot_checker)); assert(c != nullptr); - c->FinalizeInputInfo(version); - // Create output directory if it's not existed yet - std::unique_ptr output_dir; - s = CreateAndNewDirectory(fs_.get(), secondary_path_, &output_dir); - if (!s.ok()) { - return s; - } - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, immutable_db_options_.info_log.get()); - const int job_id = next_job_id_.fetch_add(1); - // use primary host's db_id for running the compaction, but db_session_id is // using the local one, which is to make sure the unique id is unique from // the remote compactors. Because the id is generated from db_id, @@ -951,17 +1405,19 @@ Status DBImplSecondary::CompactWithoutInstallation( job_id, c.get(), immutable_db_options_, mutable_db_options_, file_options_for_compaction_, versions_.get(), &shutting_down_, &log_buffer, output_dir.get(), stats_, &mutex_, &error_handler_, - input.snapshots, table_cache_, &event_logger_, dbname_, io_tracer_, + &job_context, table_cache_, &event_logger_, dbname_, io_tracer_, options.canceled ? *options.canceled : kManualCompactionCanceledFalse_, input.db_id, db_session_id_, secondary_path_, input, result); - compaction_job.Prepare(); + compaction_job.Prepare(compaction_progress_, + compaction_progress_writer.get()); mutex_.Unlock(); s = compaction_job.Run(); mutex_.Lock(); - // clean up + // These cleanup functions handle metadata and state cleanup only and + // not the physical files compaction_job.io_status().PermitUncheckedError(); compaction_job.CleanupCompaction(); c->ReleaseCompactionFiles(s); @@ -969,6 +1425,18 @@ Status DBImplSecondary::CompactWithoutInstallation( TEST_SYNC_POINT_CALLBACK("DBImplSecondary::CompactWithoutInstallation::End", &s); + + if (!compaction_progress_.empty() && s.ok()) { + uint64_t total_resumed_bytes = + CalculateResumedCompactionBytes(compaction_progress_); + + if (total_resumed_bytes > 0 && + immutable_db_options_.statistics != nullptr) { + RecordTick(immutable_db_options_.statistics.get(), + REMOTE_COMPACT_RESUMED_BYTES, total_resumed_bytes); + } + } + result->status = s; return s; } @@ -991,9 +1459,10 @@ Status DB::OpenAndCompact( } // 2. Load the options - DBOptions db_options; + DBOptions base_db_options; ConfigOptions config_options; config_options.env = override_options.env; + config_options.ignore_unknown_options = true; std::vector all_column_families; TEST_SYNC_POINT_CALLBACK( @@ -1003,13 +1472,22 @@ Status DB::OpenAndCompact( std::string options_file_name = OptionsFileName(name, compaction_input.options_file_number); - s = LoadOptionsFromFile(config_options, options_file_name, &db_options, + s = LoadOptionsFromFile(config_options, options_file_name, &base_db_options, &all_column_families); if (!s.ok()) { return s; } - // 3. Override pointer configurations in DBOptions with + // 3. Options to Override + // Override serializable configurations from override_options.options_map + DBOptions db_options; + s = GetDBOptionsFromMap(config_options, base_db_options, + override_options.options_map, &db_options); + if (!s.ok()) { + return s; + } + + // Override options that are directly set as shared ptrs in // CompactionServiceOptionsOverride db_options.env = override_options.env; db_options.file_checksum_gen_factory = @@ -1020,6 +1498,7 @@ Status DB::OpenAndCompact( // We will close the DB after the compaction anyway. // Open as many files as needed for the compaction. db_options.max_open_files = -1; + db_options.info_log = override_options.info_log; // 4. Filter CFs that are needed for OpenAndCompact() // We do not need to open all column families for the remote compaction. @@ -1029,6 +1508,18 @@ Status DB::OpenAndCompact( std::vector column_families; for (auto& cf : all_column_families) { if (cf.name == compaction_input.cf_name) { + ColumnFamilyOptions cf_options; + // Override serializable configurations from override_options.options_map + s = GetColumnFamilyOptionsFromMap(config_options, cf.options, + override_options.options_map, + &cf_options); + if (!s.ok()) { + return s; + } + cf.options = std::move(cf_options); + + // Override options that are directly set as shared ptrs in + // CompactionServiceOptionsOverride cf.options.comparator = override_options.comparator; cf.options.merge_operator = override_options.merge_operator; cf.options.compaction_filter = override_options.compaction_filter; @@ -1040,6 +1531,7 @@ Status DB::OpenAndCompact( override_options.sst_partitioner_factory; cf.options.table_properties_collector_factories = override_options.table_properties_collector_factories; + column_families.emplace_back(cf); } else if (cf.name == kDefaultColumnFamilyName) { column_families.emplace_back(cf); @@ -1047,7 +1539,7 @@ Status DB::OpenAndCompact( } // 5. Open db As Secondary - DB* db; + std::unique_ptr db; std::vector handles; s = DB::OpenAsSecondary(db_options, name, output_directory, column_families, &handles, &db); @@ -1056,6 +1548,9 @@ Status DB::OpenAndCompact( } assert(db); + TEST_SYNC_POINT_CALLBACK( + "DBImplSecondary::OpenAndCompact::AfterOpenAsSecondary:0", db.get()); + // 6. Find the handle of the Column Family that this will compact ColumnFamilyHandle* cfh = nullptr; for (auto* handle : handles) { @@ -1069,7 +1564,8 @@ Status DB::OpenAndCompact( // 7. Run the compaction without installation. // Output will be stored in the directory specified by output_directory CompactionServiceResult compaction_result; - DBImplSecondary* db_secondary = static_cast_with_check(db); + DBImplSecondary* db_secondary = + static_cast_with_check(db.get()); s = db_secondary->CompactWithoutInstallation(options, cfh, compaction_input, &compaction_result); @@ -1080,7 +1576,7 @@ Status DB::OpenAndCompact( for (auto& handle : handles) { delete handle; } - delete db; + db.reset(); if (s.ok()) { return serialization_status; } else { @@ -1097,4 +1593,153 @@ Status DB::OpenAndCompact( output, override_options); } +Status DBImplSecondary::CreateCompactionProgressWriter( + const std::string& file_path, + std::unique_ptr* compaction_progress_writer) { + std::unique_ptr file; + Status s = + fs_->NewWritableFile(file_path, FileOptions(), &file, nullptr /* dbg */); + if (!s.ok()) { + return s; + } + + std::unique_ptr file_writer( + new WritableFileWriter(std::move(file), file_path, FileOptions())); + + compaction_progress_writer->reset( + new log::Writer(std::move(file_writer), 0 /* log_number */, + false /* recycle_log_files */)); + + return Status::OK(); +} + +Status DBImplSecondary::PersistInitialCompactionProgress( + log::Writer* compaction_progress_writer, + const CompactionProgress& compaction_progress) { + assert(compaction_progress_writer); + + // LIMITATION: Only supports resuming single subcompaction + assert(compaction_progress.size() == 1); + const SubcompactionProgress& subcompaction_progress = compaction_progress[0]; + + VersionEdit edit; + edit.SetSubcompactionProgress(subcompaction_progress); + + std::string record; + if (!edit.EncodeTo(&record)) { + return Status::IOError("Failed to encode the initial compaction progress"); + } + + WriteOptions write_options(Env::IOActivity::kCompaction); + Status s = compaction_progress_writer->AddRecord(write_options, record); + if (!s.ok()) { + return s; + } + IOOptions opts; + s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (!s.ok()) { + return s; + } + + s = compaction_progress_writer->file()->Sync(opts, + immutable_db_options_.use_fsync); + + return s; +} + +Status DBImplSecondary::HandleCompactionProgressWriterCreationFailure( + const std::string& temp_file_path, const std::string& final_file_path, + std::unique_ptr* compaction_progress_writer) { + compaction_progress_writer->reset(); + + const std::vector paths_to_delete = {final_file_path, + temp_file_path}; + + Status s; + for (const auto& file_path : paths_to_delete) { + WriteOptions write_options(Env::IOActivity::kCompaction); + IOOptions opts; + s = WritableFileWriter::PrepareIOOptions(write_options, opts); + if (s.ok()) { + s = fs_->DeleteFile(file_path, opts, nullptr /* dbg */); + } + + if (!s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Failed to cleanup the compaction progress file " + "during writer creation failure: %s", + s.ToString().c_str()); + return s; + } + } + + return s; +} + +Status DBImplSecondary::FinalizeCompactionProgressWriter( + std::unique_ptr* compaction_progress_writer) { + uint64_t timestamp = env_->NowMicros(); + const std::string temp_file_path = + TempCompactionProgressFileName(secondary_path_, timestamp); + + Status s = CreateCompactionProgressWriter(temp_file_path, + compaction_progress_writer); + if (!s.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to create compaction progress writer at " + "temp path %s: %s. Will perform clean up " + "to start compaction without progress persistence", + temp_file_path.c_str(), s.ToString().c_str()); + return HandleCompactionProgressWriterCreationFailure( + temp_file_path, "" /* final_file_path */, compaction_progress_writer); + } + + if (!compaction_progress_.empty()) { + s = PersistInitialCompactionProgress(compaction_progress_writer->get(), + compaction_progress_); + if (!s.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to persist the initial copmaction " + "progress: %s. Will perform clean up " + "to start compaction without progress persistence", + s.ToString().c_str()); + return HandleCompactionProgressWriterCreationFailure( + temp_file_path, "" /* final_file_path */, compaction_progress_writer); + } + } + + compaction_progress_writer->reset(); + + std::string final_file_path; + s = RenameCompactionProgressFile(temp_file_path, &final_file_path); + + if (!s.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to rename temporary compaction progress " + "file from %s to %s: %s. Will perform clean up " + "to start compaction without progress persistence", + temp_file_path.c_str(), final_file_path.c_str(), + s.ToString().c_str()); + return HandleCompactionProgressWriterCreationFailure( + temp_file_path, final_file_path, compaction_progress_writer); + } + + s = CreateCompactionProgressWriter(final_file_path, + compaction_progress_writer); + if (!s.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to create the final compaction progress " + "writer: %s. Will attempt clean to start the compaction " + "without progress persistence", + s.ToString().c_str()); + return HandleCompactionProgressWriterCreationFailure( + "" /* temp_file_path */, final_file_path, compaction_progress_writer); + } + + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "Finalized compaction progress writer onto %s", + final_file_path.c_str()); + + return Status::OK(); +} } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h index c0d72c67e9f4..583b4081b3bc 100644 --- a/db/db_impl/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -216,9 +216,9 @@ class DBImplSecondary : public DBImpl { using DBImpl::SetOptions; Status SetOptions( - ColumnFamilyHandle* /*cfd*/, - const std::unordered_map& /*options_map*/) - override { + const std::unordered_map>& + /*column_families_opts_map*/) override { // Currently not supported because changing certain options may cause // flush/compaction and/or write to MANIFEST. return Status::NotSupported("Not supported operation in secondary mode."); @@ -248,12 +248,6 @@ class DBImplSecondary : public DBImpl { Status MaybeInitLogReader(uint64_t log_number, log::FragmentBufferedReader** log_reader); - // Check if all live files exist on file system and that their file sizes - // matche to the in-memory records. It is possible that some live files may - // have been deleted by the primary. In this case, CheckConsistency() does - // not flag the missing file as inconsistency. - Status CheckConsistency() override; - #ifndef NDEBUG Status TEST_CompactWithoutInstallation(const OpenAndCompactOptions& options, ColumnFamilyHandle* cfh, @@ -309,6 +303,87 @@ class DBImplSecondary : public DBImpl { const CompactionServiceInput& input, CompactionServiceResult* result); + private: + // Holds results of compaction progress files and output files from a single + // directory scan + struct CompactionProgressFilesScan { + // The latest (newest) progress file filename + std::optional latest_progress_filename; + uint64_t latest_progress_timestamp = 0; + + // Older progress file filenames (to be deleted) + autovector old_progress_filenames; + + // Temporary progress file filenames (to be deleted) + autovector temp_progress_filenames; + + // All output file numbers - for cleanup optimization + std::vector table_file_numbers; + + bool HasLatestProgressFile() const { + return latest_progress_filename.has_value(); + } + + void Clear() { + latest_progress_filename.reset(); + latest_progress_timestamp = 0; + old_progress_filenames.clear(); + temp_progress_filenames.clear(); + table_file_numbers.clear(); + } + }; + + Status InitializeCompactionWorkspace( + bool allow_resumption, std::unique_ptr* output_dir, + std::unique_ptr* compaction_progress_writer); + + Status PrepareCompactionProgressState(); + + Status ScanCompactionProgressFiles(CompactionProgressFilesScan* scan_result); + + Status DeleteCompactionProgressFiles( + const std::vector& filenames); + + Status CleanupOldAndTemporaryCompactionProgressFiles( + bool preserve_latest, const CompactionProgressFilesScan& scan_result); + + Status LoadCompactionProgressAndCleanupExtraOutputFiles( + const std::string& compaction_progress_file_path, + const CompactionProgressFilesScan& scan_result); + + Status ParseCompactionProgressFile( + const std::string& compaction_progress_file_path, + CompactionProgress* compaction_progress); + + Status HandleInvalidOrNoCompactionProgress( + const std::optional& compaction_progress_file_path, + const CompactionProgressFilesScan& scan_result); + + Status CleanupPhysicalCompactionOutputFiles( + bool preserve_tracked_files, + const CompactionProgressFilesScan& scan_result); + + Status FinalizeCompactionProgressWriter( + std::unique_ptr* compaction_progress_writer); + + Status CreateCompactionProgressWriter( + const std::string& file_path, + std::unique_ptr* compaction_progress_writer); + + Status PersistInitialCompactionProgress( + log::Writer* compaction_progress_writer, + const CompactionProgress& compaction_progress); + + Status RenameCompactionProgressFile(const std::string& temp_file_path, + std::string* final_file_path); + + Status HandleCompactionProgressWriterCreationFailure( + const std::string& temp_file_path, const std::string& final_file_path, + std::unique_ptr* compaction_progress_writer); + + uint64_t CalculateResumedCompactionBytes( + const CompactionProgress& compaction_progress) const; + // Cache log readers for each log number, used for continue WAL replay // after recovery std::map> log_readers_; @@ -317,6 +392,8 @@ class DBImplSecondary : public DBImpl { std::unordered_map cfd_to_current_log_; const std::string secondary_path_; + + CompactionProgress compaction_progress_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 7051c970aad7..8a4c5ec9be6c 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -157,7 +157,7 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { if (s.ok()) { s = WriteImpl(write_options, my_batch, /*callback=*/nullptr, /*user_write_cb=*/nullptr, - /*log_used=*/nullptr); + /*wal_used=*/nullptr); } return s; } @@ -190,11 +190,38 @@ Status DBImpl::WriteWithCallback(const WriteOptions& write_options, return s; } -Status DBImpl::IngestWBWI(std::shared_ptr wbwi, - const WBWIMemTable::SeqnoRange& assigned_seqno, - uint64_t prep_log, - SequenceNumber last_seqno_after_ingest, - bool memtable_updated, bool ignore_missing_cf) { +Status DBImpl::IngestWriteBatchWithIndex( + const WriteOptions& write_options, + std::shared_ptr wbwi) { + if (!wbwi) { + return Status::InvalidArgument("Batch is nullptr!"); + } + if (!write_options.disableWAL) { + return Status::NotSupported( + "IngestWriteBatchWithIndex does not support disableWAL=true"); + } + Status s; + if (write_options.protection_bytes_per_key > 0) { + s = WriteBatchInternal::UpdateProtectionInfo( + wbwi->GetWriteBatch(), write_options.protection_bytes_per_key); + } + if (s.ok()) { + WriteBatch dummy_empty_batch; + s = WriteImpl( + write_options, /*updates=*/&dummy_empty_batch, /*callback=*/nullptr, + /*user_write_cb=*/nullptr, /*log_used=*/nullptr, /*log_ref=*/0, + /*disable_memtable=*/false, /*seq_used=*/nullptr, + /*batch_cnt=*/0, /*pre_release_callback=*/nullptr, + /*post_memtable_callback=*/nullptr, /*wbwi=*/wbwi); + } + return s; +} + +Status DBImpl::IngestWBWIAsMemtable( + std::shared_ptr wbwi, + const WBWIMemTable::SeqnoRange& assigned_seqno, uint64_t min_prep_log, + SequenceNumber last_seqno_after_ingest, bool memtable_updated, + bool ignore_missing_cf) { // Keys in new memtable have seqno > last_seqno_after_ingest >= keys in wbwi. assert(assigned_seqno.upper_bound <= last_seqno_after_ingest); // Keys in the current memtable have seqno <= LastSequence() < keys in wbwi. @@ -238,12 +265,30 @@ Status DBImpl::IngestWBWI(std::shared_ptr wbwi, wbwi_memtable->AssignSequenceNumbers(assigned_seqno); // This is needed to keep the WAL that contains Prepare alive until // committed data in this memtable is persisted. - wbwi_memtable->SetMinPrepLog(prep_log); + wbwi_memtable->SetMinPrepLog(min_prep_log); memtables.push_back(wbwi_memtable); cfd->Ref(); cfds.push_back(cfd); } + autovector cfds_for_atomic_flush; + if (immutable_db_options_.atomic_flush) { + SelectColumnFamiliesForAtomicFlush(&cfds_for_atomic_flush); + for (auto cfd : cfds_for_atomic_flush) { + bool found = false; + for (auto existing_cfd : cfds) { + if (existing_cfd == cfd) { + found = true; + break; + } + } + if (!found) { + cfd->Ref(); + cfds.push_back(cfd); + } + } + } + // Stop writes to the DB by entering both write threads WriteThread::Writer nonmem_w; if (two_write_queues_) { @@ -253,15 +298,16 @@ Status DBImpl::IngestWBWI(std::shared_ptr wbwi, // Switch memtable and add WBWIMemTables Status s; - for (size_t i = 0; i < memtables.size(); ++i) { - assert(!immutable_db_options_.atomic_flush); - // NOTE: to support atomic flush, need to call - // SelectColumnFamiliesForAtomicFlush() + for (size_t i = 0; i < cfds.size(); ++i) { WriteContext write_context; // TODO: not switch on empty memtable, may need to update metadata // like NextLogNumber(), earliest_seqno and memtable id. - s = SwitchMemtable(cfds[i], &write_context, memtables[i], - last_seqno_after_ingest); + if (i < memtables.size()) { + s = SwitchMemtable(cfds[i], &write_context, memtables[i], + last_seqno_after_ingest); + } else { + s = SwitchMemtable(cfds[i], &write_context); + } if (!s.ok()) { // SwitchMemtable() can only fail if a new WAL is to be created, this // should only happen for the first call to SwitchMemtable(). log will @@ -301,9 +347,18 @@ Status DBImpl::IngestWBWI(std::shared_ptr wbwi, continue; } cfd->imm()->FlushRequested(); + if (!immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + // TODO: a new flush reason for ingesting memtable + GenerateFlushRequest({cfd}, FlushReason::kExternalFileIngestion, + &flush_req); + EnqueuePendingFlush(flush_req); + } + } + if (immutable_db_options_.atomic_flush) { + AssignAtomicFlushSeq(cfds); FlushRequest flush_req; - // TODO: a new flush reason for ingesting memtable - GenerateFlushRequest({cfd}, FlushReason::kExternalFileIngestion, + GenerateFlushRequest(cfds, FlushReason::kExternalFileIngestion, &flush_req); EnqueuePendingFlush(flush_req); } @@ -314,13 +369,12 @@ Status DBImpl::IngestWBWI(std::shared_ptr wbwi, Status DBImpl::WriteImpl(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback, - UserWriteCallback* user_write_cb, uint64_t* log_used, + UserWriteCallback* user_write_cb, uint64_t* wal_used, uint64_t log_ref, bool disable_memtable, uint64_t* seq_used, size_t batch_cnt, PreReleaseCallback* pre_release_callback, PostMemTableCallback* post_memtable_callback, - std::shared_ptr wbwi, - uint64_t prep_log) { + std::shared_ptr wbwi) { assert(!seq_per_batch_ || batch_cnt != 0); assert(my_batch == nullptr || my_batch->Count() == 0 || write_options.protection_bytes_per_key == 0 || @@ -409,9 +463,17 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, return Status::NotSupported( "DeleteRange is not compatible with row cache."); } + // Whether the WBWI is from transaction commit or a direct write + // (IngestWriteBatchWithIndex()) + bool ingest_wbwi_for_commit = false; if (wbwi) { - assert(prep_log > 0); - // Used only in WriteCommittedTxn::CommitInternal() with no `callback`. + if (my_batch->HasCommit()) { + ingest_wbwi_for_commit = true; + assert(log_ref); + } else { + // Only supports disableWAL for directly ingesting WBWI for now. + assert(write_options.disableWAL); + } assert(!callback); if (immutable_db_options_.unordered_write) { return Status::NotSupported( @@ -421,9 +483,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, return Status::NotSupported( "Ingesting WriteBatch does not support pipelined_write"); } - if (immutable_db_options_.atomic_flush) { + if (!wbwi->GetOverwriteKey()) { return Status::NotSupported( - "Ingesting WriteBatch does not support atomic_flush"); + "WriteBatchWithIndex ingestion requires overwrite_key=true"); } } // Otherwise IsLatestPersistentState optimization does not make sense @@ -444,7 +506,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // they don't consume sequence. return WriteImplWALOnly( &nonmem_write_thread_, write_options, my_batch, callback, user_write_cb, - log_used, log_ref, seq_used, batch_cnt, pre_release_callback, + wal_used, log_ref, seq_used, batch_cnt, pre_release_callback, assign_order, kDontPublishLastSeq, disable_memtable); } @@ -458,7 +520,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // sequence in in increasing order, iii) call pre_release_callback serially Status status = WriteImplWALOnly( &write_thread_, write_options, my_batch, callback, user_write_cb, - log_used, log_ref, &seq, sub_batch_cnt, pre_release_callback, + wal_used, log_ref, &seq, sub_batch_cnt, pre_release_callback, kDoAssignOrder, kDoPublishLastSeq, disable_memtable); TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL"); if (!status.ok()) { @@ -477,7 +539,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, if (immutable_db_options_.enable_pipelined_write) { return PipelinedWriteImpl(write_options, my_batch, callback, user_write_cb, - log_used, log_ref, disable_memtable, seq_used); + wal_used, log_ref, disable_memtable, seq_used); } PERF_TIMER_GUARD(write_pre_and_post_process_time); @@ -524,16 +586,19 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, assert(tmp_s.ok()); } } - versions_->SetLastSequence(last_sequence); - MemTableInsertStatusCheck(w.status); + if (w.status.ok()) { // Don't publish a partial batch write + versions_->SetLastSequence(last_sequence); + } else { + HandleMemTableInsertFailure(w.status); + } write_thread_.ExitAsBatchGroupFollower(&w); } assert(w.state == WriteThread::STATE_COMPLETED); // STATE_COMPLETED conditional below handles exit } if (w.state == WriteThread::STATE_COMPLETED) { - if (log_used != nullptr) { - *log_used = w.log_used; + if (wal_used != nullptr) { + *wal_used = w.wal_used; } if (seq_used != nullptr) { *seq_used = w.sequence; @@ -549,7 +614,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // when it finds suitable, and finish them in the same write batch. // This is how a write job could be done by the other writer. WriteContext write_context; - LogContext log_context(write_options.sync); + // FIXME: also check disableWAL like others? + WalContext wal_context(write_options.sync); WriteThread::WriteGroup write_group; bool in_parallel_group = false; uint64_t last_sequence = kMaxSequenceNumber; @@ -563,7 +629,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // PreprocessWrite does its own perf timing. PERF_TIMER_STOP(write_pre_and_post_process_time); - status = PreprocessWrite(write_options, &log_context, &write_context); + status = PreprocessWrite(write_options, &wal_context, &write_context); if (!two_write_queues_) { // Assign it after ::PreprocessWrite since the sequence might advance // inside it by WriteRecoverableState @@ -631,7 +697,13 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, continue; } // TODO: maybe handle the tracing status? - tracer_->Write(writer->batch).PermitUncheckedError(); + if (wbwi && !ingest_wbwi_for_commit) { + // for transaction write, tracer only needs the commit marker which + // is in writer->batch + tracer_->Write(wbwi->GetWriteBatch()).PermitUncheckedError(); + } else { + tracer_->Write(writer->batch).PermitUncheckedError(); + } } } } @@ -689,22 +761,21 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, if (!two_write_queues_) { if (status.ok() && !write_options.disableWAL) { - assert(log_context.log_file_number_size); - LogFileNumberSize& log_file_number_size = - *(log_context.log_file_number_size); + assert(wal_context.wal_file_number_size); + wal_context.prev_size = wal_context.writer->file()->GetFileSize(); PERF_TIMER_GUARD(write_wal_time); - io_s = - WriteToWAL(write_group, log_context.writer, log_used, - log_context.need_log_sync, log_context.need_log_dir_sync, - last_sequence + 1, log_file_number_size); + io_s = WriteGroupToWAL(write_group, wal_context.writer, wal_used, + wal_context.need_wal_sync, + wal_context.need_wal_dir_sync, last_sequence + 1, + *wal_context.wal_file_number_size); } } else { if (status.ok() && !write_options.disableWAL) { PERF_TIMER_GUARD(write_wal_time); // LastAllocatedSequence is increased inside WriteToWAL under // wal_write_mutex_ to ensure ordered events in WAL - io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, - seq_inc); + io_s = ConcurrentWriteGroupToWAL(write_group, wal_used, &last_sequence, + seq_inc); } else { // Otherwise we inc seq number for memtable writes last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); @@ -716,16 +787,16 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, last_sequence += seq_inc; // Seqno assigned to this write are [current_sequence, last_sequence] - if (log_context.need_log_sync) { + if (wal_context.need_wal_sync) { VersionEdit synced_wals; - log_write_mutex_.Lock(); + wal_write_mutex_.Lock(); if (status.ok()) { - MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync, + MarkLogsSynced(cur_wal_number_, wal_context.need_wal_dir_sync, &synced_wals); } else { - MarkLogsNotSynced(logfile_number_); + MarkLogsNotSynced(cur_wal_number_); } - log_write_mutex_.Unlock(); + wal_write_mutex_.Unlock(); if (status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); // TODO: plumb Env::IOActivity, Env::IOPriority @@ -760,7 +831,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, writer->sequence = next_sequence; if (writer->pre_release_callback) { Status ws = writer->pre_release_callback->Callback( - writer->sequence, disable_memtable, writer->log_used, index++, + writer->sequence, disable_memtable, writer->wal_used, index++, pre_release_callback_cnt); if (!ws.ok()) { status = pre_release_cb_status = ws; @@ -785,8 +856,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, write_group, current_sequence, column_family_memtables_.get(), &flush_scheduler_, &trim_history_scheduler_, write_options.ignore_missing_column_families, - 0 /*recovery_log_number*/, this, parallel, seq_per_batch_, - batch_per_txn_); + 0 /*recovery_log_number*/, this, seq_per_batch_, batch_per_txn_); } else { write_group.last_sequence = last_sequence; write_thread_.LaunchParallelMemTableWriters(&write_group); @@ -834,12 +904,13 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // handle exit, false means somebody else did should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w); } - if (wbwi) { - if (status.ok() && w.status.ok()) { + if (wbwi && status.ok() && w.status.ok()) { + uint32_t wbwi_count = wbwi->GetWriteBatch()->Count(); + // skip empty batch case + if (wbwi_count) { // w.batch contains (potentially empty) commit time batch updates, // only ingest wbwi if w.batch is applied to memtable successfully uint32_t memtable_update_count = w.batch->Count(); - uint32_t wbwi_count = wbwi->GetWriteBatch()->Count(); // Seqno assigned to this write are [last_seq + 1 - seq_inc, last_seq]. // seq_inc includes w.batch (memtable updates) and wbwi // w.batch gets first `memtable_update_count` sequence numbers. @@ -852,10 +923,12 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, if (two_write_queues_) { assert(ub <= versions_->LastAllocatedSequence()); } - status = IngestWBWI(wbwi, {/*lower_bound=*/lb, /*upper_bound=*/ub}, - prep_log, last_sequence, - /*memtable_updated=*/memtable_update_count > 0, - write_options.ignore_missing_column_families); + status = + IngestWBWIAsMemtable(wbwi, {/*lower_bound=*/lb, /*upper_bound=*/ub}, + /*min_prep_log=*/log_ref, last_sequence, + /*memtable_updated=*/memtable_update_count > 0, + write_options.ignore_missing_column_families); + RecordTick(stats_, NUMBER_WBWI_INGEST); } } @@ -873,9 +946,19 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } // Note: if we are to resume after non-OK statuses we need to revisit how // we react to non-OK statuses here. - versions_->SetLastSequence(last_sequence); + if (w.status.ok()) { // Don't publish a partial batch write + versions_->SetLastSequence(last_sequence); + } + } + if (!w.status.ok()) { + if (wal_context.prev_size < SIZE_MAX) { + InstrumentedMutexLock l(&wal_write_mutex_); + if (logs_.back().number == wal_context.wal_file_number_size->number) { + logs_.back().SetAttemptTruncateSize(wal_context.prev_size); + } + } + HandleMemTableInsertFailure(w.status); } - MemTableInsertStatusCheck(w.status); write_thread_.ExitAsBatchGroupLeader(write_group, status); } @@ -888,7 +971,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback, UserWriteCallback* user_write_cb, - uint64_t* log_used, uint64_t log_ref, + uint64_t* wal_used, uint64_t log_ref, bool disable_memtable, uint64_t* seq_used) { PERF_TIMER_GUARD(write_pre_and_post_process_time); StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); @@ -905,10 +988,10 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, if (w.callback && !w.callback->AllowWriteBatching()) { write_thread_.WaitForMemTableWriters(); } - LogContext log_context(!write_options.disableWAL && write_options.sync); + WalContext wal_context(!write_options.disableWAL && write_options.sync); // PreprocessWrite does its own perf timing. PERF_TIMER_STOP(write_pre_and_post_process_time); - w.status = PreprocessWrite(write_options, &log_context, &write_context); + w.status = PreprocessWrite(write_options, &wal_context, &write_context); PERF_TIMER_START(write_pre_and_post_process_time); // This can set non-OK status if callback fail. @@ -977,13 +1060,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, wal_write_group.size - 1); RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1); } - assert(log_context.log_file_number_size); - LogFileNumberSize& log_file_number_size = - *(log_context.log_file_number_size); - io_s = - WriteToWAL(wal_write_group, log_context.writer, log_used, - log_context.need_log_sync, log_context.need_log_dir_sync, - current_sequence, log_file_number_size); + assert(wal_context.wal_file_number_size); + WalFileNumberSize& wal_file_number_size = + *(wal_context.wal_file_number_size); + io_s = WriteGroupToWAL(wal_write_group, wal_context.writer, wal_used, + wal_context.need_wal_sync, + wal_context.need_wal_dir_sync, current_sequence, + wal_file_number_size); w.status = io_s; } @@ -995,13 +1078,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, } VersionEdit synced_wals; - if (log_context.need_log_sync) { - InstrumentedMutexLock l(&log_write_mutex_); + if (wal_context.need_wal_sync) { + InstrumentedMutexLock l(&wal_write_mutex_); if (w.status.ok()) { - MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync, + MarkLogsSynced(cur_wal_number_, wal_context.need_wal_dir_sync, &synced_wals); } else { - MarkLogsNotSynced(logfile_number_); + MarkLogsNotSynced(cur_wal_number_); } } if (w.status.ok() && synced_wals.IsWalAddition()) { @@ -1031,8 +1114,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, memtable_write_group, w.sequence, column_family_memtables_.get(), &flush_scheduler_, &trim_history_scheduler_, write_options.ignore_missing_column_families, 0 /*log_number*/, this, - false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_); - versions_->SetLastSequence(memtable_write_group.last_sequence); + seq_per_batch_, batch_per_txn_); + if (memtable_write_group.status + .ok()) { // Don't publish a partial batch write + versions_->SetLastSequence(memtable_write_group.last_sequence); + } else { + HandleMemTableInsertFailure(memtable_write_group.status); + } write_thread_.ExitAsMemTableWriter(&w, memtable_write_group); } } else { @@ -1061,8 +1149,11 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, PERF_TIMER_START(write_pre_and_post_process_time); if (write_thread_.CompleteParallelMemTableWriter(&w)) { - MemTableInsertStatusCheck(w.status); - versions_->SetLastSequence(w.write_group->last_sequence); + if (w.status.ok()) { // Don't publish a partial batch write + versions_->SetLastSequence(w.write_group->last_sequence); + } else { + HandleMemTableInsertFailure(w.status); + } write_thread_.ExitAsMemTableWriter(&w, *w.write_group); } } @@ -1134,7 +1225,7 @@ Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, Status DBImpl::WriteImplWALOnly( WriteThread* write_thread, const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback, - UserWriteCallback* user_write_cb, uint64_t* log_used, + UserWriteCallback* user_write_cb, uint64_t* wal_used, const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, const PublishLastSeq publish_last_seq, const bool disable_memtable) { @@ -1147,8 +1238,8 @@ Status DBImpl::WriteImplWALOnly( write_thread->JoinBatchGroup(&w); assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER); if (w.state == WriteThread::STATE_COMPLETED) { - if (log_used != nullptr) { - *log_used = w.log_used; + if (wal_used != nullptr) { + *wal_used = w.wal_used; } if (seq_used != nullptr) { *seq_used = w.sequence; @@ -1164,10 +1255,10 @@ Status DBImpl::WriteImplWALOnly( // TODO(myabandeh): Make preliminary checks thread-safe so we could do them // without paying the cost of obtaining the mutex. - LogContext log_context; + WalContext wal_context; WriteContext write_context; Status status = - PreprocessWrite(write_options, &log_context, &write_context); + PreprocessWrite(write_options, &wal_context, &write_context); WriteStatusCheckOnLocked(status); if (!status.ok()) { @@ -1264,8 +1355,8 @@ Status DBImpl::WriteImplWALOnly( } Status status; if (!write_options.disableWAL) { - IOStatus io_s = - ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc); + IOStatus io_s = ConcurrentWriteGroupToWAL(write_group, wal_used, + &last_sequence, seq_inc); status = io_s; // last_sequence may not be set if there is an error // This error checking and return is moved up to avoid using uninitialized @@ -1317,7 +1408,7 @@ Status DBImpl::WriteImplWALOnly( if (!writer->CallbackFailed() && writer->pre_release_callback) { assert(writer->sequence != kMaxSequenceNumber); Status ws = writer->pre_release_callback->Callback( - writer->sequence, disable_memtable, writer->log_used, index++, + writer->sequence, disable_memtable, writer->wal_used, index++, pre_release_callback_cnt); if (!ws.ok()) { status = ws; @@ -1386,24 +1477,22 @@ void DBImpl::WALIOStatusCheck(const IOStatus& io_status) { } } -void DBImpl::MemTableInsertStatusCheck(const Status& status) { - // A non-OK status here indicates that the state implied by the - // WAL has diverged from the in-memory state. This could be - // because of a corrupt write_batch (very bad), or because the - // client specified an invalid column family and didn't specify - // ignore_missing_column_families. - if (!status.ok()) { - mutex_.Lock(); - assert(!error_handler_.IsBGWorkStopped()); - error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable); - mutex_.Unlock(); - } +void DBImpl::HandleMemTableInsertFailure(const Status& status) { + assert(!status.ok()); + // A non-OK status on memtable insert indicates that the state implied by the + // WAL has diverged from the in-memory state. This could be because of a + // corrupt write_batch (very bad), or because the client specified an invalid + // column family and didn't specify ignore_missing_column_families. + mutex_.Lock(); + assert(!error_handler_.IsBGWorkStopped()); + error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable); + mutex_.Unlock(); } Status DBImpl::PreprocessWrite(const WriteOptions& write_options, - LogContext* log_context, + WalContext* wal_context, WriteContext* write_context) { - assert(write_context != nullptr && log_context != nullptr); + assert(write_context != nullptr && wal_context != nullptr); Status status; if (error_handler_.IsDBStopped()) { @@ -1413,7 +1502,8 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time); - if (UNLIKELY(status.ok() && total_log_size_ > GetMaxTotalWalSize())) { + if (UNLIKELY(status.ok() && + wals_total_size_.LoadRelaxed() > GetMaxTotalWalSize())) { assert(versions_); InstrumentedMutexLock l(&mutex_); const ColumnFamilySet* const column_families = @@ -1482,17 +1572,17 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, WriteBufferManagerStallWrites(); } } - InstrumentedMutexLock l(&log_write_mutex_); - if (status.ok() && log_context->need_log_sync) { + InstrumentedMutexLock l(&wal_write_mutex_); + if (status.ok() && wal_context->need_wal_sync) { // Wait until the parallel syncs are finished. Any sync process has to sync // the front log too so it is enough to check the status of front() - // We do a while loop since log_sync_cv_ is signalled when any sync is + // We do a while loop since wal_sync_cv_ is signalled when any sync is // finished // Note: there does not seem to be a reason to wait for parallel sync at // this early step but it is not important since parallel sync (SyncWAL) and - // need_log_sync are usually not used together. + // need_wal_sync are usually not used together. while (logs_.front().IsSyncing()) { - log_sync_cv_.Wait(); + wal_sync_cv_.Wait(); } for (auto& log : logs_) { // This is just to prevent the logs to be synced by a parallel SyncWAL @@ -1503,12 +1593,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, log.PrepareForSync(); } } else { - log_context->need_log_sync = false; + wal_context->need_wal_sync = false; } - log_context->writer = logs_.back().writer; - log_context->need_log_dir_sync = - log_context->need_log_dir_sync && !log_dir_synced_; - log_context->log_file_number_size = std::addressof(alive_log_files_.back()); + wal_context->writer = logs_.back().writer; + wal_context->need_wal_dir_sync = + wal_context->need_wal_dir_sync && !wal_dir_synced_; + wal_context->wal_file_number_size = std::addressof(alive_wal_files_.back()); return status; } @@ -1559,12 +1649,12 @@ Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group, } // When two_write_queues_ is disabled, this function is called from the only -// write thread. Otherwise this must be called holding log_write_mutex_. +// write thread. Otherwise this must be called holding wal_write_mutex_. IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch, const WriteOptions& write_options, - log::Writer* log_writer, uint64_t* log_used, + log::Writer* log_writer, uint64_t* wal_used, uint64_t* log_size, - LogFileNumberSize& log_file_number_size, + WalFileNumberSize& wal_file_number_size, SequenceNumber sequence) { assert(log_size != nullptr); @@ -1576,7 +1666,7 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch, } *log_size = log_entry.size(); // When two_write_queues_ WriteToWAL has to be protected from concurretn calls - // from the two queues anyway and log_write_mutex_ is already held. Otherwise + // from the two queues anyway and wal_write_mutex_ is already held. Otherwise // if manual_wal_flush_ is enabled we need to protect log_writer->AddRecord // from possible concurrent calls via the FlushWAL by the application. const bool needs_locking = manual_wal_flush_ && !two_write_queues_; @@ -1584,7 +1674,7 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch, // manual_wal_flush_ feature (by UNLIKELY) instead of the more common case // when we do not need any locking. if (UNLIKELY(needs_locking)) { - log_write_mutex_.Lock(); + wal_write_mutex_.Lock(); } IOStatus io_s = log_writer->MaybeAddUserDefinedTimestampSizeRecord( write_options, versions_->GetColumnFamiliesTimestampSizeForRecord()); @@ -1594,23 +1684,24 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch, io_s = log_writer->AddRecord(write_options, log_entry, sequence); if (UNLIKELY(needs_locking)) { - log_write_mutex_.Unlock(); + wal_write_mutex_.Unlock(); } - if (log_used != nullptr) { - *log_used = logfile_number_; + if (wal_used != nullptr) { + *wal_used = cur_wal_number_; + assert(*wal_used == wal_file_number_size.number); } - total_log_size_ += log_entry.size(); - log_file_number_size.AddSize(*log_size); - log_empty_ = false; + wals_total_size_.FetchAddRelaxed(log_entry.size()); + wal_file_number_size.AddSize(*log_size); + wal_empty_ = false; return io_s; } -IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, - log::Writer* log_writer, uint64_t* log_used, - bool need_log_sync, bool need_log_dir_sync, - SequenceNumber sequence, - LogFileNumberSize& log_file_number_size) { +IOStatus DBImpl::WriteGroupToWAL(const WriteThread::WriteGroup& write_group, + log::Writer* log_writer, uint64_t* wal_used, + bool need_wal_sync, bool need_wal_dir_sync, + SequenceNumber sequence, + WalFileNumberSize& wal_file_number_size) { IOStatus io_s; assert(!two_write_queues_); assert(!write_group.leader->disable_wal); @@ -1625,10 +1716,10 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, } if (merged_batch == write_group.leader->batch) { - write_group.leader->log_used = logfile_number_; + write_group.leader->wal_used = cur_wal_number_; } else if (write_with_wal > 1) { for (auto writer : write_group) { - writer->log_used = logfile_number_; + writer->wal_used = cur_wal_number_; } } @@ -1640,14 +1731,14 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, WriteOptions write_options; write_options.rate_limiter_priority = write_group.leader->rate_limiter_priority; - io_s = WriteToWAL(*merged_batch, write_options, log_writer, log_used, - &log_size, log_file_number_size, sequence); + io_s = WriteToWAL(*merged_batch, write_options, log_writer, wal_used, + &log_size, wal_file_number_size, sequence); if (to_be_cached_state) { cached_recoverable_state_ = *to_be_cached_state; cached_recoverable_state_empty_ = false; } - if (io_s.ok() && need_log_sync) { + if (io_s.ok() && need_wal_sync) { StopWatch sw(immutable_db_options_.clock, stats_, WAL_FILE_SYNC_MICROS); // It's safe to access logs_ with unlocked mutex_ here because: // - we've set getting_synced=true for all logs, @@ -1657,15 +1748,15 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, // - as long as other threads don't modify it, it's safe to read // from std::deque from multiple threads concurrently. // - // Sync operation should work with locked log_write_mutex_, because: + // Sync operation should work with locked wal_write_mutex_, because: // when DBOptions.manual_wal_flush_ is set, // FlushWAL function will be invoked by another thread. - // if without locked log_write_mutex_, the log file may get data + // if without locked wal_write_mutex_, the log file may get data // corruption const bool needs_locking = manual_wal_flush_ && !two_write_queues_; if (UNLIKELY(needs_locking)) { - log_write_mutex_.Lock(); + wal_write_mutex_.Lock(); } if (io_s.ok()) { @@ -1688,10 +1779,10 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, } if (UNLIKELY(needs_locking)) { - log_write_mutex_.Unlock(); + wal_write_mutex_.Unlock(); } - if (io_s.ok() && need_log_dir_sync) { + if (io_s.ok() && need_wal_dir_sync) { // We only sync WAL directory the first time WAL syncing is // requested, so that in case users never turn on WAL sync, // we can avoid the disk I/O in the write code path. @@ -1706,7 +1797,7 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, } if (io_s.ok()) { auto stats = default_cf_internal_stats_; - if (need_log_sync) { + if (need_wal_sync) { stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1); RecordTick(stats_, WAL_FILE_SYNCED); } @@ -1723,8 +1814,8 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, return io_s; } -IOStatus DBImpl::ConcurrentWriteToWAL( - const WriteThread::WriteGroup& write_group, uint64_t* log_used, +IOStatus DBImpl::ConcurrentWriteGroupToWAL( + const WriteThread::WriteGroup& write_group, uint64_t* wal_used, SequenceNumber* last_sequence, size_t seq_inc) { IOStatus io_s; @@ -1741,14 +1832,14 @@ IOStatus DBImpl::ConcurrentWriteToWAL( return io_s; } - // We need to lock log_write_mutex_ since logs_ and alive_log_files might be + // We need to lock wal_write_mutex_ since logs_ and alive_wal_files might be // pushed back concurrently - log_write_mutex_.Lock(); + wal_write_mutex_.Lock(); if (merged_batch == write_group.leader->batch) { - write_group.leader->log_used = logfile_number_; + write_group.leader->wal_used = cur_wal_number_; } else if (write_with_wal > 1) { for (auto writer : write_group) { - writer->log_used = logfile_number_; + writer->wal_used = cur_wal_number_; } } *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); @@ -1756,9 +1847,9 @@ IOStatus DBImpl::ConcurrentWriteToWAL( WriteBatchInternal::SetSequence(merged_batch, sequence); log::Writer* log_writer = logs_.back().writer; - LogFileNumberSize& log_file_number_size = alive_log_files_.back(); + WalFileNumberSize& wal_file_number_size = alive_wal_files_.back(); - assert(log_writer->get_log_number() == log_file_number_size.number); + assert(log_writer->get_log_number() == wal_file_number_size.number); uint64_t log_size; @@ -1766,13 +1857,13 @@ IOStatus DBImpl::ConcurrentWriteToWAL( WriteOptions write_options; write_options.rate_limiter_priority = write_group.leader->rate_limiter_priority; - io_s = WriteToWAL(*merged_batch, write_options, log_writer, log_used, - &log_size, log_file_number_size, sequence); + io_s = WriteToWAL(*merged_batch, write_options, log_writer, wal_used, + &log_size, wal_file_number_size, sequence); if (to_be_cached_state) { cached_recoverable_state_ = *to_be_cached_state; cached_recoverable_state_empty_ = false; } - log_write_mutex_.Unlock(); + wal_write_mutex_.Unlock(); if (io_s.ok()) { const bool concurrent = true; @@ -1800,7 +1891,7 @@ Status DBImpl::WriteRecoverableState() { bool dont_care_bool; SequenceNumber next_seq; if (two_write_queues_) { - log_write_mutex_.Lock(); + wal_write_mutex_.Lock(); } SequenceNumber seq; if (two_write_queues_) { @@ -1815,13 +1906,17 @@ Status DBImpl::WriteRecoverableState() { 0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */, &next_seq, &dont_care_bool, seq_per_batch_); auto last_seq = next_seq - 1; - if (two_write_queues_) { - versions_->FetchAddLastAllocatedSequence(last_seq - seq); - versions_->SetLastPublishedSequence(last_seq); + if (status.ok()) { // Don't publish a partial batch write + if (two_write_queues_) { + versions_->FetchAddLastAllocatedSequence(last_seq - seq); + versions_->SetLastPublishedSequence(last_seq); + } + versions_->SetLastSequence(last_seq); + } else { + HandleMemTableInsertFailure(status); } - versions_->SetLastSequence(last_seq); if (two_write_queues_) { - log_write_mutex_.Unlock(); + wal_write_mutex_.Unlock(); } if (status.ok() && recoverable_state_pre_release_callback_) { const bool DISABLE_MEMTABLE = true; @@ -1893,7 +1988,10 @@ void DBImpl::AssignAtomicFlushSeq(const autovector& cfds) { assert(immutable_db_options_.atomic_flush); auto seq = versions_->LastSequence(); for (auto cfd : cfds) { - cfd->imm()->AssignAtomicFlushSeq(seq); + // cfd can be nullptr, see ScheduleFlushes() + if (cfd) { + cfd->imm()->AssignAtomicFlushSeq(seq); + } } } @@ -1902,11 +2000,11 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) { assert(write_context != nullptr); Status status; - if (alive_log_files_.begin()->getting_flushed) { + if (alive_wal_files_.begin()->getting_flushed) { return status; } - auto oldest_alive_log = alive_log_files_.begin()->number; + auto oldest_alive_log = alive_wal_files_.begin()->number; bool flush_wont_release_oldest_log = false; if (allow_2pc()) { auto oldest_log_with_uncommitted_prep = @@ -1936,14 +2034,14 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) { // transactions then we cannot flush this log until those transactions are // commited. unable_to_release_oldest_log_ = false; - alive_log_files_.begin()->getting_flushed = true; + alive_wal_files_.begin()->getting_flushed = true; } ROCKS_LOG_INFO( immutable_db_options_.info_log, "Flushing all column families with data in WAL number %" PRIu64 ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64, - oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize()); + oldest_alive_log, wals_total_size_.LoadRelaxed(), GetMaxTotalWalSize()); // no need to refcount because drop is happening in write thread, so can't // happen while we're in the write thread autovector cfds; @@ -2413,21 +2511,21 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context, // Do this without holding the dbmutex lock. assert(versions_->prev_log_number() == 0); if (two_write_queues_) { - log_write_mutex_.Lock(); + wal_write_mutex_.Lock(); } - bool creating_new_log = !log_empty_; + bool creating_new_log = !wal_empty_; if (two_write_queues_) { - log_write_mutex_.Unlock(); + wal_write_mutex_.Unlock(); } uint64_t recycle_log_number = 0; // If file deletion is disabled, don't recycle logs since it'll result in // the file getting renamed if (creating_new_log && immutable_db_options_.recycle_log_file_num && - !log_recycle_files_.empty() && IsFileDeletionsEnabled()) { - recycle_log_number = log_recycle_files_.front(); + !wal_recycle_files_.empty() && IsFileDeletionsEnabled()) { + recycle_log_number = wal_recycle_files_.front(); } uint64_t new_log_number = - creating_new_log ? versions_->NewFileNumber() : logfile_number_; + creating_new_log ? versions_->NewFileNumber() : cur_wal_number_; // For use outside of holding DB mutex const MutableCFOptions mutable_cf_options_copy = cfd->GetLatestMutableCFOptions(); @@ -2453,14 +2551,14 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context, mutex_.Unlock(); if (creating_new_log) { PredecessorWALInfo info; - log_write_mutex_.Lock(); + wal_write_mutex_.Lock(); if (!logs_.empty()) { log::Writer* cur_log_writer = logs_.back().writer; info = PredecessorWALInfo(cur_log_writer->get_log_number(), cur_log_writer->file()->GetFileSize(), cur_log_writer->GetLastSeqnoRecorded()); } - log_write_mutex_.Unlock(); + wal_write_mutex_.Unlock(); // TODO: Write buffer size passed in should be max of all CF's instead // of mutable_cf_options.write_buffer_size. io_s = CreateWAL(write_options, new_log_number, recycle_log_number, @@ -2501,11 +2599,11 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context, // concurrent full purges don't delete the file while we're recycling it. // To achieve that we hold the old log number in the recyclable list until // after it has been renamed. - assert(log_recycle_files_.front() == recycle_log_number); - log_recycle_files_.pop_front(); + assert(wal_recycle_files_.front() == recycle_log_number); + wal_recycle_files_.pop_front(); } if (s.ok() && creating_new_log) { - InstrumentedMutexLock l(&log_write_mutex_); + InstrumentedMutexLock l(&wal_write_mutex_); assert(new_log != nullptr); if (!logs_.empty()) { // Alway flush the buffer of the last log before switching to a new one @@ -2527,11 +2625,11 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context, } } if (s.ok()) { - logfile_number_ = new_log_number; - log_empty_ = true; - log_dir_synced_ = false; - logs_.emplace_back(logfile_number_, new_log); - alive_log_files_.emplace_back(logfile_number_); + cur_wal_number_ = new_log_number; + wal_empty_ = true; + wal_dir_synced_ = false; + logs_.emplace_back(cur_wal_number_, new_log); + alive_wal_files_.emplace_back(cur_wal_number_); } } @@ -2562,7 +2660,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context, // obsolete. So we should track the WAL obsoletion event before actually // updating the empty CF's log number. uint64_t min_wal_number_to_keep = - versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_); + versions_->PreComputeMinLogNumberWithUnflushedData(cur_wal_number_); if (min_wal_number_to_keep > versions_->GetWalSet().GetMinWalNumberToKeep()) { // TODO: plumb Env::IOActivity, Env::IOPriority @@ -2597,7 +2695,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context, for (auto cf : empty_cfs) { if (cf->IsEmpty()) { - cf->SetLogNumber(logfile_number_); + cf->SetLogNumber(cur_wal_number_); // MEMPURGE: No need to change this, because new adds // should still receive new sequence numbers. cf->mem()->SetCreationSeq(versions_->LastSequence()); @@ -2614,14 +2712,14 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context, // advance the log number. no need to persist this in the manifest if (cf->IsEmpty()) { if (creating_new_log) { - cf->SetLogNumber(logfile_number_); + cf->SetLogNumber(cur_wal_number_); } cf->mem()->SetCreationSeq(versions_->LastSequence()); } } } - cfd->mem()->SetNextLogNumber(logfile_number_); + cfd->mem()->SetNextLogNumber(cur_wal_number_); assert(new_mem != nullptr); cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_); if (new_imm) { @@ -2633,7 +2731,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context, // we always try to flush all immutable memtable. For atomic flush, these // two memtables will be marked eligible for flush in the same call to // AssignAtomicFlushSeq(). - new_imm->SetNextLogNumber(logfile_number_); + new_imm->SetNextLogNumber(cur_wal_number_); cfd->imm()->Add(new_imm, &context->memtables_to_free_); } new_mem->Ref(); diff --git a/db/db_io_failure_test.cc b/db/db_io_failure_test.cc index ecef6e860aba..4021ea73d30a 100644 --- a/db/db_io_failure_test.cc +++ b/db/db_io_failure_test.cc @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include + #include "db/db_test_util.h" #include "port/stack_trace.h" #include "test_util/testutil.h" diff --git a/db/db_iter.cc b/db/db_iter.cc index c5a099103653..bd8f179655a6 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -9,7 +9,6 @@ #include "db/db_iter.h" -#include #include #include @@ -24,6 +23,7 @@ #include "memory/arena.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/env.h" +#include "rocksdb/io_dispatcher.h" #include "rocksdb/iterator.h" #include "rocksdb/merge_operator.h" #include "rocksdb/options.h" @@ -42,9 +42,8 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, const MutableCFOptions& mutable_cf_options, const Comparator* cmp, InternalIterator* iter, const Version* version, SequenceNumber s, bool arena_mode, - uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh, - bool expose_blob_index) + bool expose_blob_index, ReadOnlyMemTable* active_mem) : prefix_extractor_(mutable_cf_options.prefix_extractor.get()), env_(_env), clock_(ioptions.clock), @@ -58,11 +57,21 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, read_callback_(read_callback), sequence_(s), statistics_(ioptions.stats), - max_skip_(max_sequential_skip_in_iterations), + max_skip_(mutable_cf_options.max_sequential_skip_in_iterations), max_skippable_internal_keys_(read_options.max_skippable_internal_keys), num_internal_keys_skipped_(0), iterate_lower_bound_(read_options.iterate_lower_bound), iterate_upper_bound_(read_options.iterate_upper_bound), + cfh_(cfh), + timestamp_ub_(read_options.timestamp), + timestamp_lb_(read_options.iter_start_ts), + timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0), + active_mem_(active_mem), + memtable_seqno_lb_(kMaxSequenceNumber), + memtable_op_scan_flush_trigger_(0), + avg_op_scan_flush_trigger_(0), + iter_step_since_seek_(1), + mem_hidden_op_scanned_since_seek_(0), direction_(kForward), valid_(false), current_entry_is_merged_(false), @@ -76,11 +85,7 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, expose_blob_index_(expose_blob_index), allow_unprepared_value_(read_options.allow_unprepared_value), is_blob_(false), - arena_mode_(arena_mode), - cfh_(cfh), - timestamp_ub_(read_options.timestamp), - timestamp_lb_(read_options.iter_start_ts), - timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0) { + arena_mode_(arena_mode) { RecordTick(statistics_, NO_ITERATOR_CREATED); if (pin_thru_lifetime_) { pinned_iters_mgr_.StartPinning(); @@ -94,6 +99,25 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, // prefix_seek_opt_in_only should force total_order_seek whereever the caller // is duplicating the original ReadOptions assert(!ioptions.prefix_seek_opt_in_only || read_options.total_order_seek); + if (active_mem_) { + // FIXME: GetEarliestSequenceNumber() may return a seqno that is one smaller + // than the smallest seqno in the memtable. This violates its comment and + // entries with that seqno may not be in the active memtable. Before it's + // fixed, we use GetFirstSequenceNumber() for more accurate result. + memtable_seqno_lb_ = active_mem_->IsEmpty() + ? active_mem_->GetEarliestSequenceNumber() + : active_mem_->GetFirstSequenceNumber(); + memtable_op_scan_flush_trigger_ = + mutable_cf_options.memtable_op_scan_flush_trigger; + if (memtable_op_scan_flush_trigger_) { + // avg_op_scan_flush_trigger_ requires memtable_op_scan_flush_trigger_ > 0 + avg_op_scan_flush_trigger_ = + mutable_cf_options.memtable_avg_op_scan_flush_trigger; + } + } else { + // memtable_op_scan_flush_trigger_ and avg_op_scan_flush_trigger_ are + // initialized to 0(disabled) as default. + } } Status DBIter::GetProperty(std::string prop_name, std::string* prop) { @@ -155,6 +179,7 @@ void DBIter::Next() { local_stats_.skip_count_ += num_internal_keys_skipped_; local_stats_.skip_count_--; num_internal_keys_skipped_ = 0; + iter_step_since_seek_++; bool ok = true; if (direction_ == kReverse) { is_key_seqnum_zero_ = false; @@ -369,6 +394,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, // to one. bool reseek_done = false; + uint64_t mem_hidden_op_scanned = 0; do { // Will update is_key_seqnum_zero_ as soon as we parsed the current key // but we need to save the previous value to be used in the loop. @@ -425,6 +451,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) <= 0) { num_skipped++; // skip this entry PERF_COUNTER_ADD(internal_key_skipped_count, 1); + MarkMemtableForFlushForPerOpTrigger(mem_hidden_op_scanned); } else { assert(!skipping_saved_key || CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0); @@ -446,6 +473,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, !iter_.iter()->IsKeyPinned() /* copy */); skipping_saved_key = true; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + MarkMemtableForFlushForPerOpTrigger(mem_hidden_op_scanned); } break; case kTypeValue: @@ -484,7 +512,6 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, valid_ = true; return true; - break; case kTypeMerge: if (!PrepareValueInternal()) { return false; @@ -496,7 +523,6 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, current_entry_is_merged_ = true; valid_ = true; return MergeValuesNewToOld(); // Go to a different state machine - break; default: valid_ = false; status_ = Status::Corruption( @@ -1097,7 +1123,6 @@ bool DBIter::FindValueForCurrentKey() { } return true; } - break; case kTypeValue: case kTypeValuePreferredSeqno: SetValueAndColumnsFromPlain(pinned_value_); @@ -1224,6 +1249,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { if (timestamp_lb_ != nullptr) { saved_key_.SetInternalKey(ikey); + } else { + saved_key_.SetUserKey(ikey.user_key); } valid_ = true; @@ -1539,11 +1566,123 @@ void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) { } } +Status DBIter::ValidateScanOptions(const MultiScanArgs& multiscan_opts) const { + if (multiscan_opts.empty()) { + return Status::InvalidArgument("Empty MultiScanArgs"); + } + + const std::vector& scan_opts = multiscan_opts.GetScanRanges(); + const bool has_limit = scan_opts.front().range.limit.has_value(); + if (!has_limit && scan_opts.size() > 1) { + return Status::InvalidArgument("Scan has no upper bound"); + } + + for (size_t i = 0; i < scan_opts.size(); ++i) { + const auto& scan_range = scan_opts[i].range; + if (!scan_range.start.has_value()) { + return Status::InvalidArgument("Scan has no start key at index " + + std::to_string(i)); + } + + if (scan_range.limit.has_value()) { + if (user_comparator_.CompareWithoutTimestamp( + scan_range.start.value(), /*a_has_ts=*/false, + scan_range.limit.value(), /*b_has_ts=*/false) >= 0) { + return Status::InvalidArgument( + "Scan start key is large or equal than limit at index " + + std::to_string(i)); + } + } + + if (i > 0) { + if (!scan_range.limit.has_value()) { + // multiple scan without limit scan ranges + return Status::InvalidArgument("Scan has no upper bound at index " + + std::to_string(i)); + } + + const auto& last_end_key = scan_opts[i - 1].range.limit.value(); + if (user_comparator_.CompareWithoutTimestamp( + scan_range.start.value(), /*a_has_ts=*/false, last_end_key, + /*b_has_ts=*/false) < 0) { + return Status::InvalidArgument("Overlapping ranges at index " + + std::to_string(i)); + } + } + } + return Status::OK(); +} + +void DBIter::Prepare(const MultiScanArgs& scan_opts) { + status_ = ValidateScanOptions(scan_opts); + if (!status_.ok()) { + return; + } + std::optional new_scan_opts; + new_scan_opts.emplace(scan_opts); + scan_opts_.swap(new_scan_opts); + scan_index_ = 0; + + // Create a shared IODispatcher if not provided. This allows all + // BlockBasedTableIterators in this scan to share a single dispatcher, + // enabling better IO coordination and future rate limiting. + if (!scan_opts_.value().io_dispatcher) { + scan_opts_->io_dispatcher.reset(NewIODispatcher()); + } + + if (!scan_opts.empty()) { + iter_.Prepare(&scan_opts_.value()); + } else { + iter_.Prepare(nullptr); + } +} + void DBIter::Seek(const Slice& target) { PERF_COUNTER_ADD(iter_seek_count, 1); PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); StopWatch sw(clock_, statistics_, DB_SEEK); + if (scan_opts_.has_value()) { + // Validate the seek target is as expected in the previously prepared range + auto const& scan_ranges = scan_opts_.value().GetScanRanges(); + if (scan_index_ >= scan_ranges.size()) { + status_ = Status::InvalidArgument( + "Seek called after exhausting all of the scan ranges"); + valid_ = false; + return; + } + + // Validate start key of next prepare range matches the seek target + auto const& range = scan_ranges[scan_index_]; + auto const& start = range.range.start; + assert(start.has_value()); + if (user_comparator_.CompareWithoutTimestamp(target, *start) != 0) { + status_ = Status::InvalidArgument( + "Seek target does not match the start of the next prepared range at " + "index " + + std::to_string(scan_index_)); + valid_ = false; + return; + } + + // validate the upper bound is set to the same value of limit, if limit + // exists + auto const& limit = range.range.limit; + if (limit.has_value()) { + if (iterate_upper_bound_ == nullptr || + user_comparator_.CompareWithoutTimestamp( + limit.value(), *iterate_upper_bound_) != 0) { + status_ = Status::InvalidArgument( + "Upper bound is not set to the same limit value of the next " + "prepared range at index " + + std::to_string(scan_index_)); + valid_ = false; + return; + } + } + scan_index_++; + } + if (cfh_ != nullptr) { // TODO: What do we do if this returns an error? Slice lower_bound, upper_bound; @@ -1568,6 +1707,7 @@ void DBIter::Seek(const Slice& target) { ResetBlobData(); ResetValueAndColumns(); ResetInternalKeysSkippedCounter(); + MarkMemtableForFlushForAvgTrigger(); // Seek the inner iterator based on the target key. { @@ -1644,6 +1784,7 @@ void DBIter::SeekForPrev(const Slice& target) { ResetBlobData(); ResetValueAndColumns(); ResetInternalKeysSkippedCounter(); + MarkMemtableForFlushForAvgTrigger(); // Seek the inner iterator based on the target key. { @@ -1705,6 +1846,7 @@ void DBIter::SeekToFirst() { ResetBlobData(); ResetValueAndColumns(); ResetInternalKeysSkippedCounter(); + MarkMemtableForFlushForAvgTrigger(); ClearSavedValue(); is_key_seqnum_zero_ = false; @@ -1768,6 +1910,7 @@ void DBIter::SeekToLast() { ResetBlobData(); ResetValueAndColumns(); ResetInternalKeysSkippedCounter(); + MarkMemtableForFlushForAvgTrigger(); ClearSavedValue(); is_key_seqnum_zero_ = false; @@ -1790,21 +1933,4 @@ void DBIter::SeekToLast() { StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_))); } } - -Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, - const ImmutableOptions& ioptions, - const MutableCFOptions& mutable_cf_options, - const Comparator* user_key_comparator, - InternalIterator* internal_iter, const Version* version, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, - ColumnFamilyHandleImpl* cfh, bool expose_blob_index) { - DBIter* db_iter = new DBIter( - env, read_options, ioptions, mutable_cf_options, user_key_comparator, - internal_iter, version, sequence, false, - max_sequential_skip_in_iterations, read_callback, cfh, expose_blob_index); - return db_iter; -} - } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_iter.h b/db/db_iter.h index 084ed80d41a0..575dc455eedc 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -12,7 +12,6 @@ #include #include "db/db_impl/db_impl.h" -#include "db/range_del_aggregator.h" #include "memory/arena.h" #include "options/cf_options.h" #include "rocksdb/db.h" @@ -57,6 +56,34 @@ class Version; // numbers, deletion markers, overwrites, etc. class DBIter final : public Iterator { public: + // Return a new DBIter that reads from `internal_iter` at the specified + // `sequence` number. + // + // @param active_mem Pointer to the active memtable that `internal_iter` + // is reading from. If not null, the memtable can be marked for flush + // according to options mutable_cf_options.memtable_op_scan_flush_trigger + // and mutable_cf_options.memtable_avg_op_scan_flush_trigger. + // @param arena_mode If true, the DBIter will be allocated from the arena. + static DBIter* NewIter(Env* env, const ReadOptions& read_options, + const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + const Comparator* user_key_comparator, + InternalIterator* internal_iter, + const Version* version, const SequenceNumber& sequence, + ReadCallback* read_callback, + ReadOnlyMemTable* active_mem, + ColumnFamilyHandleImpl* cfh = nullptr, + bool expose_blob_index = false, + Arena* arena = nullptr) { + void* mem = arena ? arena->AllocateAligned(sizeof(DBIter)) + : operator new(sizeof(DBIter)); + DBIter* db_iter = new (mem) + DBIter(env, read_options, ioptions, mutable_cf_options, + user_key_comparator, internal_iter, version, sequence, arena, + read_callback, cfh, expose_blob_index, active_mem); + return db_iter; + } + // The following is grossly complicated. TODO: clean it up // Which direction is the iterator currently moving? // (1) When moving forward: @@ -113,19 +140,12 @@ class DBIter final : public Iterator { uint64_t skip_count_; }; - DBIter(Env* _env, const ReadOptions& read_options, - const ImmutableOptions& ioptions, - const MutableCFOptions& mutable_cf_options, const Comparator* cmp, - InternalIterator* iter, const Version* version, SequenceNumber s, - bool arena_mode, uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh, - bool expose_blob_index); - // No copying allowed DBIter(const DBIter&) = delete; void operator=(const DBIter&) = delete; ~DBIter() override { + MarkMemtableForFlushForAvgTrigger(); ThreadStatus::OperationType cur_op_type = ThreadStatusUtil::GetThreadOperation(); ThreadStatusUtil::SetThreadOperation( @@ -220,7 +240,18 @@ class DBIter final : public Iterator { bool PrepareValue() override; + void Prepare(const MultiScanArgs& scan_opts) override; + Status ValidateScanOptions(const MultiScanArgs& multiscan_opts) const; + private: + DBIter(Env* _env, const ReadOptions& read_options, + const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Comparator* cmp, + InternalIterator* iter, const Version* version, SequenceNumber s, + bool arena_mode, ReadCallback* read_callback, + ColumnFamilyHandleImpl* cfh, bool expose_blob_index, + ReadOnlyMemTable* active_mem); + class BlobReader { public: BlobReader(const Version* version, ReadTier read_tier, @@ -379,6 +410,36 @@ class DBIter final : public Iterator { return true; } + void MarkMemtableForFlushForAvgTrigger() { + if (avg_op_scan_flush_trigger_ && + mem_hidden_op_scanned_since_seek_ >= memtable_op_scan_flush_trigger_ && + mem_hidden_op_scanned_since_seek_ >= + static_cast(iter_step_since_seek_) * + avg_op_scan_flush_trigger_) { + assert(memtable_op_scan_flush_trigger_ > 0); + active_mem_->MarkForFlush(); + avg_op_scan_flush_trigger_ = 0; + memtable_op_scan_flush_trigger_ = 0; + } + iter_step_since_seek_ = 1; + mem_hidden_op_scanned_since_seek_ = 0; + } + + void MarkMemtableForFlushForPerOpTrigger(uint64_t& mem_hidden_op_scanned) { + if (memtable_op_scan_flush_trigger_ && + ikey_.sequence >= memtable_seqno_lb_) { + if (++mem_hidden_op_scanned >= memtable_op_scan_flush_trigger_) { + active_mem_->MarkForFlush(); + // Turn off the flush trigger checks. + memtable_op_scan_flush_trigger_ = 0; + avg_op_scan_flush_trigger_ = 0; + } + if (avg_op_scan_flush_trigger_) { + ++mem_hidden_op_scanned_since_seek_; + } + } + } + const SliceTransform* prefix_extractor_; Env* const env_; SystemClock* clock_; @@ -425,6 +486,25 @@ class DBIter final : public Iterator { IterKey prefix_; Status status_; + Slice lazy_blob_index_; + + // List of operands for merge operator. + MergeContext merge_context_; + LocalStatistics local_stats_; + PinnedIteratorsManager pinned_iters_mgr_; + ColumnFamilyHandleImpl* cfh_; + const Slice* const timestamp_ub_; + const Slice* const timestamp_lb_; + const size_t timestamp_size_; + std::string saved_timestamp_; + std::optional scan_opts_; + size_t scan_index_{0}; + ReadOnlyMemTable* const active_mem_; + SequenceNumber memtable_seqno_lb_; + uint32_t memtable_op_scan_flush_trigger_; + uint32_t avg_op_scan_flush_trigger_; + uint32_t iter_step_since_seek_; + uint32_t mem_hidden_op_scanned_since_seek_; Direction direction_; bool valid_; bool current_entry_is_merged_; @@ -443,29 +523,7 @@ class DBIter final : public Iterator { // the stacked BlobDB implementation is used, false otherwise. bool expose_blob_index_; bool allow_unprepared_value_; - Slice lazy_blob_index_; bool is_blob_; bool arena_mode_; - // List of operands for merge operator. - MergeContext merge_context_; - LocalStatistics local_stats_; - PinnedIteratorsManager pinned_iters_mgr_; - ColumnFamilyHandleImpl* cfh_; - const Slice* const timestamp_ub_; - const Slice* const timestamp_lb_; - const size_t timestamp_size_; - std::string saved_timestamp_; }; - -// Return a new iterator that converts internal keys (yielded by -// "*internal_iter") that were live at the specified `sequence` number -// into appropriate user keys. -Iterator* NewDBIterator( - Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, - const MutableCFOptions& mutable_cf_options, - const Comparator* user_key_comparator, InternalIterator* internal_iter, - const Version* version, const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, - ColumnFamilyHandleImpl* cfh = nullptr, bool expose_blob_index = false); - } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_iter_stress_test.cc b/db/db_iter_stress_test.cc index daecbcc7acb6..c6d3936b3ccf 100644 --- a/db/db_iter_stress_test.cc +++ b/db/db_iter_stress_test.cc @@ -528,12 +528,11 @@ TEST_F(DBIteratorStressTest, StressTest) { internal_iter->target_hidden_fraction = target_hidden_fraction; internal_iter->trace = trace; - db_iter.reset(NewDBIterator( + db_iter.reset(DBIter::NewIter( env_, ropt, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), - internal_iter, nullptr /* version */, sequence, - options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + internal_iter, /*version=*/nullptr, sequence, + nullptr /*read_callback*/, /*active_mem=*/nullptr)); } // Do a random operation. It's important to do it on ref_it diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index cf8321808f9f..d18aa0bac4a1 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -259,11 +259,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); ReadOptions ro; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -294,11 +293,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); ReadOptions ro; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -322,11 +320,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -356,11 +353,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -393,11 +389,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -425,11 +420,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 7 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -465,11 +459,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 4 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -492,11 +485,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -517,11 +509,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -554,11 +545,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 7 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -586,11 +576,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); ReadOptions ro; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -631,11 +620,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); ReadOptions ro; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 2 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -664,11 +652,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); ReadOptions ro; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -696,11 +683,10 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 0 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); ASSERT_OK(db_iter->status()); @@ -710,11 +696,10 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 0 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); ASSERT_OK(db_iter->status()); @@ -735,11 +720,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) { } internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 2 /* sequence */, options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + 2 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -782,11 +766,11 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, i + 2 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -820,11 +804,11 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddPut("c", "200"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, i + 2 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -851,11 +835,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddPut("c", "200"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 202 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -886,11 +869,11 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { } internal_iter->AddPut("c", "200"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, i /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); ASSERT_OK(db_iter->status()); @@ -906,11 +889,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { } internal_iter->AddPut("c", "200"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 200 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -944,11 +926,11 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { } internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, i + 2 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -981,11 +963,11 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { } internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, i + 2 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1033,11 +1015,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { internal_iter->Finish(); ro.max_skippable_internal_keys = 0; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1081,11 +1062,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { internal_iter->Finish(); ro.max_skippable_internal_keys = 2; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1127,11 +1107,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { internal_iter->Finish(); ro.max_skippable_internal_keys = 2; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1167,11 +1146,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { internal_iter->Finish(); ro.max_skippable_internal_keys = 2; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1204,11 +1182,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { internal_iter->Finish(); ro.max_skippable_internal_keys = 2; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1236,11 +1213,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { internal_iter->Finish(); ro.max_skippable_internal_keys = 2; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1275,11 +1251,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { internal_iter->Finish(); ro.max_skippable_internal_keys = 2; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1314,11 +1289,11 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { internal_iter->Finish(); ro.max_skippable_internal_keys = i; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1369,11 +1344,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { options.max_sequential_skip_in_iterations = 1000; ro.max_skippable_internal_keys = i; - std::unique_ptr db_iter(NewDBIterator( - env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + std::unique_ptr db_iter(DBIter::NewIter( + env_, ro, ioptions, MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1412,11 +1386,11 @@ TEST_F(DBIteratorTest, DBIteratorTimedPutBasic) { internal_iter->AddTimedPut("d", "3", /*write_unix_time=*/0); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + options.max_sequential_skip_in_iterations = 1; + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 7 /* sequence */, /*max_sequential_skip_in_iterations*/ 1, - nullptr /* read_callback */)); + 7 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1463,11 +1437,10 @@ TEST_F(DBIteratorTest, DBIterator1) { internal_iter->AddMerge("b", "2"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 1 /* sequence */, options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + 1 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1493,11 +1466,10 @@ TEST_F(DBIteratorTest, DBIterator2) { internal_iter->AddMerge("b", "2"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 0 /* sequence */, options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + 0 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1519,11 +1491,10 @@ TEST_F(DBIteratorTest, DBIterator3) { internal_iter->AddMerge("b", "2"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 2 /* sequence */, options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + 2 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1545,11 +1516,10 @@ TEST_F(DBIteratorTest, DBIterator4) { internal_iter->AddMerge("b", "2"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 4 /* sequence */, options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + 4 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1580,11 +1550,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 0 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1605,11 +1574,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 1 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1630,11 +1598,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 2 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1655,11 +1622,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 3 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1680,11 +1646,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 4 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1705,11 +1670,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 5 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1730,11 +1694,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 6 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1753,11 +1716,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_2"); internal_iter->AddPut("b", "val_b"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 10 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -1785,11 +1747,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 0 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1810,11 +1771,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 1 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1835,11 +1795,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 2 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1860,11 +1819,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 3 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); ASSERT_OK(db_iter->status()); @@ -1881,11 +1839,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 4 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1906,11 +1863,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 5 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1931,11 +1887,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 6 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1976,11 +1931,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 0 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -2013,11 +1967,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 2 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2056,11 +2009,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 4 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2099,11 +2051,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 5 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2147,11 +2098,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 6 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2196,11 +2146,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 7 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2239,11 +2188,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 9 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2288,11 +2236,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 13 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2338,11 +2285,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), internal_iter, nullptr /* version */, 14 /* sequence */, - options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2371,11 +2317,10 @@ TEST_F(DBIteratorTest, DBIterator8) { internal_iter->AddPut("b", "0"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 10 /* sequence */, options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + 10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -2403,11 +2348,11 @@ TEST_F(DBIteratorTest, DBIterator9) { internal_iter->AddMerge("d", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, nullptr /* version */, - 10 /* sequence */, options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + std::unique_ptr db_iter( + DBIter::NewIter(env_, ro, ImmutableOptions(options), + MutableCFOptions(options), BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2471,11 +2416,10 @@ TEST_F(DBIteratorTest, DBIterator10) { internal_iter->AddPut("d", "4"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 10 /* sequence */, options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + 10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->Seek("c"); ASSERT_TRUE(db_iter->Valid()); @@ -2512,10 +2456,10 @@ TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) { internal_iter->AddPut("b", "2"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */)); + 10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -2542,11 +2486,10 @@ TEST_F(DBIteratorTest, DBIterator11) { internal_iter->AddMerge("b", "2"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 1 /* sequence */, options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + 1 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -2571,10 +2514,10 @@ TEST_F(DBIteratorTest, DBIterator12) { internal_iter->AddSingleDeletion("b"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */)); + 10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -2610,11 +2553,11 @@ TEST_F(DBIteratorTest, DBIterator13) { internal_iter->AddPut(key, "8"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + options.max_sequential_skip_in_iterations = 3; + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 2 /* sequence */, 3 /* max_sequential_skip_in_iterations */, - nullptr /* read_callback */)); + 2 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), key); @@ -2640,11 +2583,11 @@ TEST_F(DBIteratorTest, DBIterator14) { internal_iter->AddPut("c", "9"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + options.max_sequential_skip_in_iterations = 1; + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 4 /* sequence */, 1 /* max_sequential_skip_in_iterations */, - nullptr /* read_callback */)); + 4 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -2680,11 +2623,12 @@ class DBIterWithMergeIterTest : public testing::Test { InternalIterator* merge_iter = NewMergingIterator(&icomp_, child_iters.data(), 2u); - db_iter_.reset(NewDBIterator( + options_.max_sequential_skip_in_iterations = 3; + db_iter_.reset(DBIter::NewIter( env_, ro_, ImmutableOptions(options_), MutableCFOptions(options_), BytewiseComparator(), merge_iter, nullptr /* version */, - 8 /* read data earlier than seqId 8 */, - 3 /* max iterators before reseek */, nullptr /* read_callback */)); + 8 /* read data earlier than seqId 8 */, nullptr /* read_callback */, + /*active_mem=*/nullptr)); } Env* env_; @@ -3120,11 +3064,10 @@ TEST_F(DBIteratorTest, SeekPrefixTombstones) { internal_iter->Finish(); ro.prefix_same_as_start = true; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 10 /* sequence */, options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + 10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); int skipped_keys = 0; @@ -3157,11 +3100,11 @@ TEST_F(DBIteratorTest, SeekToFirstLowerBound) { Slice lower_bound(lower_bound_str); ro.iterate_lower_bound = &lower_bound; Options options; - std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, nullptr /* version */, - 10 /* sequence */, options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + std::unique_ptr db_iter( + DBIter::NewIter(env_, ro, ImmutableOptions(options), + MutableCFOptions(options), BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToFirst(); if (i == kNumKeys + 1) { @@ -3197,11 +3140,10 @@ TEST_F(DBIteratorTest, PrevLowerBound) { Slice lower_bound(lower_bound_str); ro.iterate_lower_bound = &lower_bound; Options options; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 10 /* sequence */, options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + 10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekToLast(); for (int i = kNumKeys; i >= kLowerBound; --i) { @@ -3226,11 +3168,10 @@ TEST_F(DBIteratorTest, SeekLessLowerBound) { Slice lower_bound(lower_bound_str); ro.iterate_lower_bound = &lower_bound; Options options; - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ro, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 10 /* sequence */, options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + 10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); auto before_lower_bound_str = std::to_string(kLowerBound - 1); Slice before_lower_bound(lower_bound_str); @@ -3252,11 +3193,10 @@ TEST_F(DBIteratorTest, ReverseToForwardWithDisappearingKeys) { } internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( + std::unique_ptr db_iter(DBIter::NewIter( env_, ReadOptions(), ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), internal_iter, nullptr /* version */, - 10 /* sequence */, options.max_sequential_skip_in_iterations, - nullptr /* read_callback */)); + 10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr)); db_iter->SeekForPrev("a"); ASSERT_TRUE(db_iter->Valid()); diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index ad3afd17f4f2..d2371abfa890 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -8,12 +8,15 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include +#include +#include #include "db/arena_wrapped_db_iter.h" #include "db/db_iter.h" #include "db/db_test_util.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/io_dispatcher.h" #include "rocksdb/iostats_context.h" #include "rocksdb/perf_context.h" #include "table/block_based/flush_block_policy_impl.h" @@ -1839,11 +1842,6 @@ class SliceTransformLimitedDomainGeneric : public SliceTransform { // prefix will be x???? return src.size() >= 1; } - - bool InRange(const Slice& dst) const override { - // prefix will be x???? - return dst.size() == 1; - } }; TEST_P(DBIteratorTest, IterSeekForPrevCrossingFiles) { @@ -2571,7 +2569,7 @@ TEST_P(DBIteratorTest, AutoRefreshIterator) { ReadOptions read_options; std::unique_ptr snapshot = nullptr; if (explicit_snapshot) { - snapshot = std::make_unique(db_); + snapshot = std::make_unique(db_.get()); } read_options.snapshot = explicit_snapshot ? snapshot->snapshot() : nullptr; @@ -3824,6 +3822,1576 @@ TEST_F(DBIteratorTest, IteratorsConsistentViewExplicitSnapshot) { } } +TEST_P(DBIteratorTest, MemtableOpsScanFlushTriggerWithSeek) { + // Tests that option memtable_op_scan_flush_trigger works when the limit + // is reached during a Seek() operation. + const int kTrigger = 10; + Random* r = Random::GetTLSInstance(); + + for (int trigger : {kTrigger, kTrigger + 1}) { + for (bool delete_only : {false, true}) { + Options options; + options.create_if_missing = true; + options.memtable_op_scan_flush_trigger = trigger; + options.level_compaction_dynamic_level_bytes = true; + DestroyAndReopen(options); + + // Base data that will be covered by a consecutive sequence of tombstones. + int kNumKeys = delete_only ? kTrigger : kTrigger / 2; + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Put(Key(i), r->RandomString(100))); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); + ASSERT_EQ(1, NumTableFilesAtLevel(6)); + + if (delete_only) { + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(SingleDelete(Key(i))); + } + } else { + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Put(Key(i), r->RandomString(100))); + } + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Delete(Key(i))); + } + } + + SetPerfLevel(PerfLevel::kEnableCount); + get_perf_context()->Reset(); + ReadOptions ro; + std::unique_ptr iter(db_->NewIterator(ro)); + + // Seek to the first key, this will scan through all the tombstones and + // hidden puts + iter->Seek(Key(0)); + ASSERT_FALSE( + iter->Valid()); // All keys are deleted, so iterator is not valid + ASSERT_OK(iter->status()); + ASSERT_EQ(get_perf_context()->next_on_memtable_count, kTrigger); + + // Skipping kNumTrigger memtable entries in a single iterator operation + // should mark the memtable for flush. + // + // At the end of a write, we check and update memtable to request a flush + ASSERT_OK(Put(Key(11), "val")); + // Before a write, we schedule memtables for flush if requested. + ASSERT_OK(Put(Key(12), "val")); + ASSERT_OK(db_->WaitForCompact({})); + + if (trigger <= kTrigger) { + // Check if memtable was flushed due to scan trigger + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + uint64_t val = 0; + ASSERT_TRUE( + db_->GetIntProperty("rocksdb.num-deletes-active-mem-table", &val)); + ASSERT_EQ(0, val); + } else { + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + uint64_t val = 0; + ASSERT_TRUE( + db_->GetIntProperty("rocksdb.num-deletes-active-mem-table", &val)); + ASSERT_EQ(kNumKeys, val); + } + } + } +} + +TEST_P(DBIteratorTest, MemtableOpsScanFlushTriggerWithNext) { + // Tests that option memtable_op_scan_flush_trigger works when the limit + // is reached during a Next() operation, and not trigger a flush when + // the limit is reached across multiple Next() operations. + const int kTrigger = 10; + Random* r = Random::GetTLSInstance(); + + for (int trigger : {kTrigger, kTrigger + 1}) { + for (bool delete_only : {false, true}) { + Options options; + options.create_if_missing = true; + options.memtable_op_scan_flush_trigger = trigger; + options.level_compaction_dynamic_level_bytes = true; + DestroyAndReopen(options); + + // Base data that will be covered by a consecutive sequence of tombstones. + int kNumKeys = delete_only ? kTrigger : kTrigger / 2; + for (int i = 0; i <= kNumKeys; ++i) { + ASSERT_OK(Put(Key(i), r->RandomString(100))); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); + ASSERT_EQ(1, NumTableFilesAtLevel(6)); + + ASSERT_OK(Put(Key(0), "val")); + if (delete_only) { + for (int i = 1; i <= kNumKeys; ++i) { + ASSERT_OK(SingleDelete(Key(i))); + } + } else { + for (int i = 1; i <= kNumKeys; ++i) { + ASSERT_OK(Put(Key(i), r->RandomString(100))); + } + for (int i = 1; i <= kNumKeys; ++i) { + ASSERT_OK(Delete(Key(i))); + } + } + + // Total number of tombstones and hidden puts scanned across multiple + // Next() operations below will be kTrigger, and it should not trigger a + // flush when the limit is kTrigger + 1. + ASSERT_OK(Put(Key(kNumKeys + 1), "v1")); + ASSERT_OK(Delete(Key(kNumKeys + 2))); + ASSERT_OK(Put(Key(kNumKeys + 3), "v3")); + + SetPerfLevel(PerfLevel::kEnableCount); + get_perf_context()->Reset(); + ReadOptions ro; + std::unique_ptr iter(db_->NewIterator(ro)); + iter->Seek(Key(0)); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value(), "val"); + ASSERT_OK(iter->status()); + ASSERT_EQ(get_perf_context()->next_on_memtable_count, 0); + iter->Next(); + // kTrigger tombstones and invisible puts and 1 for the visible put + ASSERT_EQ(get_perf_context()->next_on_memtable_count, kTrigger + 1); + iter->Next(); + ASSERT_EQ(get_perf_context()->next_on_memtable_count, kTrigger + 3); + + // Skipping kNumTrigger memtable entries in a single iterator operation + // should mark the memtable for flush. + // + // At the end of a write, we check and update memtable to request a flush + ASSERT_OK(Put(Key(11), "val")); + // Before a write, we schedule memtables for flush if requested. + ASSERT_OK(Put(Key(12), "val")); + ASSERT_OK(db_->WaitForCompact({})); + + if (trigger <= kTrigger) { + // Check if memtable was flushed due to scan trigger + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + uint64_t val = 0; + ASSERT_TRUE( + db_->GetIntProperty("rocksdb.num-deletes-active-mem-table", &val)); + ASSERT_EQ(0, val); + } else { + uint64_t val = 0; + ASSERT_TRUE( + db_->GetIntProperty("rocksdb.num-deletes-active-mem-table", &val)); + ASSERT_EQ(kNumKeys + 1, val); + } + } + } +} + +TEST_P(DBIteratorTest, AverageMemtableOpsScanFlushTrigger) { + // Tests option memtable_avg_op_scan_flush_trigger with + // long tombstone sequences. + Random* r = Random::GetTLSInstance(); + + const int kAvgTrigger = 10; + const int kMaxTrigger = 500; + Options options; + options.create_if_missing = true; + options.memtable_op_scan_flush_trigger = kMaxTrigger; + options.memtable_avg_op_scan_flush_trigger = kAvgTrigger; + options.level_compaction_dynamic_level_bytes = true; + DestroyAndReopen(options); + + const int kNumKeys = 1000; + // Base data that will be covered by a consecutive sequence of tombstones. + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Put(Key(i), r->RandomString(50))); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); + ASSERT_EQ(1, NumTableFilesAtLevel(6)); + + for (int i = 0; i < kNumKeys; ++i) { + // We issue slightly more deletions than kAvgTrigger between visible keys + // to ensure avg skipped entries exceed kAvgTrigger. + if (i % (kAvgTrigger + 2) != 0) { + ASSERT_OK(SingleDelete(Key(i))); + } + } + + // Each operation, except the first Seek, is expected to see kAvgTrigger + 1 + // tombstones (from the active memtable) before it finds the next visible key. + SetPerfLevel(PerfLevel::kEnableCount); + get_perf_context()->Reset(); + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->Seek(Key(1)); + ASSERT_EQ(get_perf_context()->next_on_memtable_count, kAvgTrigger + 1); + iter.reset(); + // Should not flush since total entries skipped is below + // memtable_op_scan_flush_trigger + ASSERT_OK(Put(Key(0), "dummy write")); + ASSERT_OK(Put(Key(0), "dummy write")); + ASSERT_OK(db_->WaitForCompact({})); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + + get_perf_context()->Reset(); + iter.reset(db_->NewIterator(ReadOptions())); + int num_ops = 1; + uint64_t num_skipped = 0; + iter->Seek(Key(0)); + ASSERT_EQ(iter->key(), Key(0)); + uint64_t last_memtable_next_count = + get_perf_context()->next_on_memtable_count; + iter->Next(); + num_ops++; + while (iter->Valid()) { + ASSERT_OK(iter->status()); + uint64_t num_skipped_in_op = + get_perf_context()->next_on_memtable_count - last_memtable_next_count; + ASSERT_GE(num_skipped_in_op, kAvgTrigger + 1); + last_memtable_next_count = get_perf_context()->next_on_memtable_count; + num_skipped += num_skipped_in_op; + iter->Next(); + num_ops++; + } + // During iterator destruction we mark memtable for flush + iter.reset(); + + // avg trigger + ASSERT_GE(num_skipped, kAvgTrigger * num_ops); + // memtable_op_scan_flush_trigger + ASSERT_GE(num_skipped, kMaxTrigger); + // Average hidden entries scanned from memtable per operation is more than + // kAvgTrigger and the total skipped is more than + // memtable_op_scan_flush_trigger, the current memtable should be marked for + // flush. The following two writes will trigger the flush. + ASSERT_OK(Put(Key(0), "dummy write")); + // Before a write, we schedule memtables for flush if requested. + ASSERT_OK(Put(Key(0), "dummy write")); + ASSERT_OK(db_->WaitForCompact({})); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); +} + +TEST_P(DBIteratorTest, AverageMemtableOpsScanFlushTriggerByOverwrites) { + // Tests option memtable_avg_op_scan_flush_trigger with overwrites to keys. + Random* r = Random::GetTLSInstance(); + + const int kAvgTrigger = 25; + Options options; + options.create_if_missing = true; + options.memtable_op_scan_flush_trigger = 250; + options.memtable_avg_op_scan_flush_trigger = kAvgTrigger; + options.level_compaction_dynamic_level_bytes = true; + DestroyAndReopen(options); + + const int kNumKeys = 100; + // Base data that will be covered by a consecutive sequence of tombstones. + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Put(Key(i), r->RandomString(50))); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); + ASSERT_EQ(1, NumTableFilesAtLevel(6)); + + // One visible key every 10 keys. + // Each non-visible user key has 3 non-visible entries in the active memtable. + for (int i = 0; i < kNumKeys; ++i) { + if (i % 10 != 0) { + ASSERT_OK(Put(Key(i), r->RandomString(50))); + ASSERT_OK(Put(Key(i), r->RandomString(50))); + ASSERT_OK(Delete(Key(i))); + } + } + + SetPerfLevel(PerfLevel::kEnableCount); + get_perf_context()->Reset(); + ReadOptions ro; + std::unique_ptr iter(db_->NewIterator(ro)); + iter->Seek(Key(1)); + ASSERT_GT(get_perf_context()->next_on_memtable_count, kAvgTrigger); + // Re-seek to trigger check for flush trigger + iter->Seek(Key(1)); + // Should not flush since total entries skipped is below + // memtable_op_scan_flush_trigger + ASSERT_FALSE(static_cast(db_->DefaultColumnFamily()) + ->cfd() + ->mem() + ->IsMarkedForFlush()); + ASSERT_OK(Put(Key(0), "dummy write")); + ASSERT_OK(Put(Key(0), "dummy write")); + ASSERT_OK(db_->WaitForCompact({})); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + get_perf_context()->Reset(); + + int num_ops = 1; + iter->Seek(Key(1)); + while (iter->Valid()) { + num_ops++; + iter->Next(); + } + ASSERT_GT(get_perf_context()->next_on_memtable_count, num_ops * kAvgTrigger); + + // Re-seek should check conditions for marking memtable for flush + iter->Seek(Key(80)); + + // Average hidden entries scanned from memtable per operation is 2. + ASSERT_OK(Put(Key(0), "dummy write")); + // Before a write, we schedule memtables for flush if requested. + ASSERT_OK(Put(Key(0), "dummy write")); + ASSERT_OK(db_->WaitForCompact({})); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); +} + +class DBMultiScanIteratorTest : public DBTestBase, + public ::testing::WithParamInterface { + public: + DBMultiScanIteratorTest() + : DBTestBase("db_multi_scan_iterator_test", /*env_do_fsync=*/true) {} +}; + +// Param 0: ReadOptions::fill_cache +INSTANTIATE_TEST_CASE_P(DBMultiScanIteratorTest, DBMultiScanIteratorTest, + ::testing::Bool()); + +TEST_P(DBMultiScanIteratorTest, BasicTest) { + auto options = CurrentOptions(); + DestroyAndReopen(options); + + // Create a file + for (int i = 0; i < 100; ++i) { + std::stringstream ss; + ss << std::setw(2) << std::setfill('0') << i; + ASSERT_OK(Put("k" + ss.str(), "val" + ss.str())); + } + ASSERT_OK(Flush()); + + std::vector key_ranges({"k03", "k10", "k25", "k50"}); + ReadOptions ro; + ro.fill_cache = GetParam(); + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.insert(key_ranges[0], key_ranges[1]); + scan_options.insert(key_ranges[2], key_ranges[3]); + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + std::unique_ptr iter = + dbfull()->NewMultiScan(ro, cfh, scan_options); + try { + int idx = 0; + int count = 0; + for (auto range : *iter) { + for (auto it : range) { + ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0); + ASSERT_LT(it.first.ToString().compare(key_ranges[idx + 1]), 0); + count++; + } + idx += 2; + } + ASSERT_EQ(count, 32); + } catch (MultiScanException& ex) { + // Make sure exception contains the status + ASSERT_NOK(ex.status()); + std::cerr << "Iterator returned status " << ex.what(); + abort(); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + iter.reset(); +} + +TEST_P(DBMultiScanIteratorTest, MixedBoundsTest) { + auto options = CurrentOptions(); + DestroyAndReopen(options); + // Create a file + for (int i = 0; i < 100; ++i) { + std::stringstream ss; + ss << std::setw(2) << std::setfill('0') << i; + ASSERT_OK(Put("k" + ss.str(), "val" + ss.str())); + } + ASSERT_OK(Flush()); + + std::vector key_ranges( + {"k03", "k10", "k25", "k50", "k75", "k90"}); + ReadOptions ro; + ro.fill_cache = GetParam(); + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.insert(key_ranges[0], key_ranges[1]); + scan_options.insert(key_ranges[2]); + scan_options.insert(key_ranges[4], key_ranges[5]); + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + std::unique_ptr iter = + dbfull()->NewMultiScan(ro, cfh, scan_options); + try { + int idx = 0; + int count = 0; + for (auto range : *iter) { + for (auto it : range) { + ASSERT_GE( + it.first.ToString().compare( + scan_options.GetScanRanges()[idx].range.start->ToString()), + 0); + if (scan_options.GetScanRanges()[idx].range.limit) { + ASSERT_LT( + it.first.ToString().compare( + scan_options.GetScanRanges()[idx].range.limit->ToString()), + 0); + } + count++; + } + idx++; + } + ASSERT_EQ(count, 97); + } catch (MultiScanException& ex) { + // Make sure exception contains the status + ASSERT_NOK(ex.status()); + std::cerr << "Iterator returned status " << ex.what(); + abort(); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + iter.reset(); + scan_options = MultiScanArgs(BytewiseComparator()); + scan_options.insert(key_ranges[0]); + scan_options.insert(key_ranges[2], key_ranges[3]); + scan_options.insert(key_ranges[4]); + iter = dbfull()->NewMultiScan(ro, cfh, scan_options); + try { + int idx = 0; + int count = 0; + for (auto range : *iter) { + for (auto it : range) { + ASSERT_GE( + it.first.ToString().compare( + scan_options.GetScanRanges()[idx].range.start->ToString()), + 0); + if (scan_options.GetScanRanges()[idx].range.limit) { + ASSERT_LT( + it.first.ToString().compare( + scan_options.GetScanRanges()[idx].range.limit->ToString()), + 0); + } + count++; + } + idx++; + } + ASSERT_EQ(count, 147); + } catch (MultiScanException& ex) { + // Make sure exception contains the status + ASSERT_NOK(ex.status()); + std::cerr << "Iterator returned status " << ex.what(); + abort(); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + iter.reset(); +} + +TEST_P(DBMultiScanIteratorTest, RangeAcrossFiles) { + auto options = CurrentOptions(); + options.target_file_size_base = 100 << 10; // 20KB + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 50; + options.compression = kNoCompression; + DestroyAndReopen(options); + + auto rnd = Random::GetTLSInstance(); + // Write ~200KB data + for (int i = 0; i < 100; ++i) { + ASSERT_OK(Put(Key(i), rnd->RandomString(2 << 10))); + } + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); + ASSERT_EQ(2, NumTableFilesAtLevel(49)); + std::vector key_ranges({Key(10), Key(90)}); + ReadOptions ro; + ro.fill_cache = GetParam(); + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.insert(key_ranges[0], key_ranges[1]); + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + std::unique_ptr iter = + dbfull()->NewMultiScan(ro, cfh, scan_options); + try { + int i = 10; + for (auto range : *iter) { + for (auto it : range) { + ASSERT_EQ(it.first.ToString(), Key(i)); + ++i; + } + } + ASSERT_EQ(i, 90); + } catch (MultiScanException& ex) { + // Make sure exception contains the status + ASSERT_NOK(ex.status()); + std::cerr << "Iterator returned status " << ex.what(); + abort(); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + iter.reset(); +} + +TEST_P(DBMultiScanIteratorTest, FailureTest) { + auto options = CurrentOptions(); + options.compression = kNoCompression; + DestroyAndReopen(options); + + Random rnd(301); + // Create a file + for (int i = 0; i < 100; ++i) { + std::stringstream ss; + ss << std::setw(2) << std::setfill('0') << i; + ASSERT_OK(Put("k" + ss.str(), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + + std::vector key_ranges({"k04", "k06", "k12", "k14"}); + ReadOptions ro; + Slice ub; + ro.iterate_upper_bound = &ub; + ro.fill_cache = GetParam(); + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.insert(key_ranges[0], key_ranges[1]); + scan_options.insert(key_ranges[2], key_ranges[3]); + scan_options.max_prefetch_size = 4500; + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + std::unique_ptr iter(dbfull()->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + iter->Prepare(scan_options); + int count = 0; + ub = key_ranges[1]; + iter->Seek(key_ranges[0]); + while (iter->status().ok() && iter->Valid()) { + ASSERT_GE(iter->key().compare(key_ranges[0]), 0); + ASSERT_LT(iter->key().compare(key_ranges[1]), 0); + count++; + iter->Next(); + } + ASSERT_OK(iter->status()) << iter->status().ToString(); + ASSERT_EQ(count, 2); + + // Second seek should hit the max_prefetch_size limit + ub = key_ranges[3]; + iter->Seek(key_ranges[2]); + ASSERT_NOK(iter->status()); + iter.reset(); + + // Test the case of unexpected Seek key + iter.reset(dbfull()->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + scan_options.max_prefetch_size = 0; + iter->Prepare(scan_options); + ub = key_ranges[3]; + iter->Seek(key_ranges[2]); + ASSERT_NOK(iter->status()); + iter.reset(); +} + +TEST_P(DBMultiScanIteratorTest, OutOfL0FileRange) { + // Test that prepare does not fail scan when a scan range + // is outside of a L0 file's key range. + auto options = CurrentOptions(); + options.compression = kNoCompression; + DestroyAndReopen(options); + + Random rnd(301); + // Create a Lmax file + // key01 ~ key99 + for (int i = 0; i < 100; ++i) { + std::stringstream ss; + ss << std::setw(2) << std::setfill('0') << i; + ASSERT_OK(Put("k" + ss.str(), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + // Create a L0 file + // key00 ~ key09 + for (int i = 0; i < 10; ++i) { + std::stringstream ss; + ss << std::setw(2) << std::setfill('0') << i; + ASSERT_OK(Put("k" + ss.str(), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + + // The second range is outside of L0 file's key range + std::vector key_ranges({"k04", "k06", "k12", "k14"}); + ReadOptions ro; + Slice ub; + ro.iterate_upper_bound = &ub; + ro.fill_cache = GetParam(); + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.insert(key_ranges[0], key_ranges[1]); + scan_options.insert(key_ranges[2], key_ranges[3]); + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + std::unique_ptr iter(dbfull()->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + iter->Prepare(scan_options); + int count = 0; + ub = key_ranges[1]; + iter->Seek(key_ranges[0]); + while (iter->status().ok() && iter->Valid()) { + ASSERT_GE(iter->key().compare(key_ranges[0]), 0); + ASSERT_LT(iter->key().compare(key_ranges[1]), 0); + count++; + iter->Next(); + } + ASSERT_OK(iter->status()) << iter->status().ToString(); + ASSERT_EQ(count, 2); + + ub = key_ranges[3]; + count = 0; + iter->Seek(key_ranges[2]); + while (iter->status().ok() && iter->Valid()) { + ASSERT_GE(iter->key().compare(key_ranges[2]), 0); + ASSERT_LT(iter->key().compare(key_ranges[3]), 0); + count++; + iter->Next(); + } + ASSERT_OK(iter->status()) << iter->status().ToString(); + ASSERT_EQ(count, 2); +} + +TEST_P(DBMultiScanIteratorTest, RangeBetweenFiles) { + auto options = CurrentOptions(); + options.target_file_size_base = 100 << 10; // 20KB + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 50; + options.compression = kNoCompression; + DestroyAndReopen(options); + + auto rnd = Random::GetTLSInstance(); + // Write ~200KB data + for (int i = 0; i < 100; ++i) { + ASSERT_OK(Put(Key(i), rnd->RandomString(2 << 10))); + } + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); + ASSERT_EQ(2, NumTableFilesAtLevel(49)); + + // Test with a scan range that overlaps an entire file, with upper bound + // between 2 files + std::vector file_meta; + dbfull()->GetLiveFilesMetaData(&file_meta); + ASSERT_EQ(file_meta.size(), 2); + std::vector key_ranges(4); + key_ranges[0] = file_meta[0].smallestkey; + key_ranges[1] = file_meta[0].largestkey + "0"; + key_ranges[2] = file_meta[1].smallestkey + "0"; + key_ranges[3] = file_meta[1].largestkey; + ReadOptions ro; + ro.fill_cache = GetParam(); + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.insert(key_ranges[0], key_ranges[1]); + scan_options.insert(key_ranges[2], key_ranges[3]); + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + std::unique_ptr iter = + dbfull()->NewMultiScan(ro, cfh, scan_options); + try { + for (auto range : *iter) { + for (auto it : range) { + ASSERT_GE(it.first.ToString(), key_ranges[0]); + } + } + } catch (MultiScanException& ex) { + // Make sure exception contains the status + ASSERT_NOK(ex.status()); + std::cerr << "Iterator returned status " << ex.what(); + abort(); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + iter.reset(); + + // Test multiscan with a range entirely between adjacent files + key_ranges[0] = file_meta[0].largestkey + "0"; + key_ranges[1] = file_meta[0].largestkey + "1"; + key_ranges[2] = file_meta[1].smallestkey + "0"; + key_ranges[3] = file_meta[1].largestkey; + (*scan_options).clear(); + scan_options.insert(key_ranges[0], key_ranges[1]); + scan_options.insert(key_ranges[2], key_ranges[3]); + iter = dbfull()->NewMultiScan(ro, cfh, scan_options); + try { + for (auto range : *iter) { + for (auto it : range) { + ASSERT_GE(it.first.ToString(), key_ranges[0]); + } + } + } catch (MultiScanException& ex) { + // Make sure exception contains the status + ASSERT_NOK(ex.status()); + std::cerr << "Iterator returned status " << ex.what(); + abort(); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + iter.reset(); +} + +// This test case tests multiscan in the presence of fragmented range +// tombstones in the LSM. +TEST_P(DBMultiScanIteratorTest, FragmentedRangeTombstones) { + auto options = CurrentOptions(); + // Compaction may create files 2x the target_file_size_base, + // so set this to 50KB so we atleast end up with 2 files of + // 100KB + options.target_file_size_base = 50 << 10; // 50KB + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 50; + options.compression = kNoCompression; + DestroyAndReopen(options); + + // Setup the LSM as follows - + // 1. Ingest a file with 100 keys + // 2. Ingest a file with one overlapping key + // 3. Do a Put and flush a file to L0 with one overlapping key + // 4. Ingest a standalone delete range file that covers the full key space + // and a file with the same 100 keys with new values. This will ingest + // into L0 due to the presence of an existing file in L0 + // The final LSM will have an SST in Lmax with 100 keys, and 2 SST files + // in Lmax-1 with half the keys each and completely overlapping delete ranges + std::unordered_map kvs; + auto rnd = Random::GetTLSInstance(); + auto create_ingestion_data_file_and_update_key_value = + [&](const std::string& filename, int start_key, int end_key) { + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options)); + ASSERT_OK(writer->Open(filename)); + for (int i = start_key; i < end_key; ++i) { + auto kiter = kvs.find(Key(i)); + if (kiter != kvs.end()) { + kvs.erase(kiter); + } + auto res = + kvs.emplace(std::make_pair(Key(i), rnd->RandomString(2 << 10))); + ASSERT_OK(writer->Put(res.first->first, res.first->second)); + } + ASSERT_OK(writer->Finish()); + writer.reset(); + }; + + CreateColumnFamilies({"new_cf"}, options); + std::string ingest_file = dbname_ + "test.sst"; + // Write ~200KB data + create_ingestion_data_file_and_update_key_value(ingest_file + "_0", 0, 100); + create_ingestion_data_file_and_update_key_value(ingest_file + "_1", 50, 51); + ColumnFamilyHandle* cfh = handles_[0]; + IngestExternalFileOptions ifo; + Status s = dbfull()->IngestExternalFile( + cfh, {ingest_file + "_0", ingest_file + "_1"}, ifo); + ASSERT_OK(s); + + ASSERT_OK(Put(0, Key(50), rnd->RandomString(2 << 10))); + ASSERT_OK(Flush()); + + { + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options)); + ASSERT_OK(writer->Open(ingest_file + "_2")); + ASSERT_OK(writer->DeleteRange("a", "z")); + ASSERT_OK(writer->Finish()); + writer.reset(); + } + create_ingestion_data_file_and_update_key_value(ingest_file + "_3", 0, 100); + s = dbfull()->IngestExternalFile( + cfh, {ingest_file + "_2", ingest_file + "_3"}, ifo); + ASSERT_OK(s); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(cfh, &cf_meta); + // Only the L0 with range deletion is compacted. + ASSERT_EQ(1, cf_meta.levels[0].files.size()); + ASSERT_EQ(0, cf_meta.levels[0].files[0].num_deletions); + + // The first scan range overlaps the DB key range, while the second extends + // beyond but overlaps the delete range + std::vector key_ranges({"key000085", "key000090", "l", "n"}); + ReadOptions ro; + ro.fill_cache = GetParam(); + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.insert(key_ranges[0], key_ranges[1]); + scan_options.insert(key_ranges[2], key_ranges[3]); + std::unique_ptr iter = + dbfull()->NewMultiScan(ro, cfh, scan_options); + try { + int i = 0; + int count = 0; + for (auto range : *iter) { + for (auto it : range) { + ASSERT_GE(it.first.ToString(), key_ranges[i]); + ASSERT_LT(it.first.ToString(), key_ranges[i + 1]); + auto kiter = kvs.find(it.first.ToString()); + ASSERT_NE(kiter, kvs.end()); + ASSERT_EQ(kiter->second, it.second.ToString()); + count++; + } + i += 2; + } + ASSERT_EQ(i, 4); + ASSERT_EQ(count, 5); + } catch (MultiScanException& ex) { + ASSERT_OK(ex.status()); + } + iter.reset(); + + // The second scan range start overlaps the delete range in the first file + // in Lmax-1, while the end overlaps the keys in the second file + (*scan_options).clear(); + key_ranges[0] = "key000010"; + key_ranges[1] = "key000020"; + key_ranges[2] = "key0000500"; + key_ranges[3] = "key000060"; + scan_options.insert(key_ranges[0], key_ranges[1]); + scan_options.insert(key_ranges[2], key_ranges[3]); + iter = dbfull()->NewMultiScan(ro, cfh, scan_options); + try { + int i = 0; + int count = 0; + for (auto range : *iter) { + for (auto it : range) { + ASSERT_GE(it.first.ToString(), key_ranges[i]); + ASSERT_LT(it.first.ToString(), key_ranges[i + 1]); + auto kiter = kvs.find(it.first.ToString()); + ASSERT_NE(kiter, kvs.end()); + ASSERT_EQ(kiter->second, it.second.ToString()); + count++; + } + i += 2; + } + ASSERT_EQ(i, 4); + ASSERT_EQ(count, 19); + } catch (MultiScanException& ex) { + ASSERT_OK(ex.status()); + } + iter.reset(); +} + +TEST_P(DBMultiScanIteratorTest, ReseekAcrossBlocksSameUserKey) { + // This test exposes a bug where multiscan reseeks backwards when + // max_sequential_skip_in_iterations is triggered with the same user key + // spanning multiple data blocks. + + auto options = CurrentOptions(); + options.max_sequential_skip_in_iterations = 3; + options.compression = kNoCompression; + + // Force each internal key into its own block + BlockBasedTableOptions table_options; + table_options.flush_block_policy_factory = + std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + // Taking a snapshot after each Put to preserve all versions during flush. + std::vector snapshots; + for (int i = 0; i < 7; ++i) { + ASSERT_OK(Put("key_a", "value_" + std::to_string(i))); + snapshots.push_back(db_->GetSnapshot()); + } + ASSERT_OK(Put("key_b", "value_b")); + + ASSERT_OK(Flush()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + // Setup multiscan range covering both keys + std::vector key_ranges({"key_a", "key_c"}); + ReadOptions ro; + Slice ub = key_ranges[1]; + ro.iterate_upper_bound = &ub; + ro.fill_cache = GetParam(); + + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.insert(key_ranges[0], key_ranges[1]); + + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + std::unique_ptr iter(dbfull()->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + iter->Prepare(scan_options); + + std::vector seen_keys; + std::vector seen_values; + iter->Seek(key_ranges[0]); + while (iter->status().ok() && iter->Valid()) { + seen_keys.push_back(iter->key().ToString()); + seen_values.push_back(iter->value().ToString()); + iter->Next(); + } + ASSERT_OK(iter->status()) << iter->status().ToString(); + + ASSERT_EQ(seen_keys.size(), 2) << "Should see key_a and key_b"; + ASSERT_EQ(seen_keys[0], "key_a"); + ASSERT_EQ(seen_keys[1], "key_b"); + ASSERT_EQ(seen_values[0], "value_6"); + ASSERT_EQ(seen_values[1], "value_b"); + + for (auto* snapshot : snapshots) { + db_->ReleaseSnapshot(snapshot); + } +} + +TEST_P(DBMultiScanIteratorTest, AsyncPrefetchAcrossMultipleFiles) { + // Test async prefetch with multiple ranges within a single file + auto options = CurrentOptions(); + options.target_file_size_base = 1 << 15; // 32KiB + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 50; + options.compression = kNoCompression; + options.statistics = CreateDBStatistics(); + DestroyAndReopen(options); + + Random rnd(303); + + // Create a single large file with many keys + // ~1MiB of data + // Should be lots of files now + for (int i = 0; i < 1000; ++i) { + std::stringstream ss; + ss << "k" << std::setw(5) << std::setfill('0') << i; + // 1KiB values + ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10))); + } + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); + + ASSERT_GT(NumTableFilesAtLevel(49), 3); + + // Set up multiple non-overlapping ranges in the same file + // Every 32 values should be a file or so + std::vector key_ranges( + {"k00000", "k00100", "k00500", "k00600", "k00800", "k00900"}); + ReadOptions ro; + ro.fill_cache = GetParam(); + + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.use_async_io = true; + scan_options.insert(key_ranges[0], key_ranges[1]); + scan_options.insert(key_ranges[2], key_ranges[3]); + scan_options.insert(key_ranges[4], key_ranges[5]); + + auto read_count_before = + options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT); + std::unique_ptr iter = + dbfull()->NewMultiScan(ro, cfh, scan_options); + ASSERT_NE(iter, nullptr); + auto read_count_after = + options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT); + ASSERT_EQ(read_count_after, read_count_before); + + // Verify all three ranges can be scanned successfully + try { + for (auto range : *iter) { + for (auto it : range) { + it.first.ToString(); + } + } + } catch (MultiScanException& ex) { + // Make sure exception contains the status + ASSERT_NOK(ex.status()); + std::cerr << "Iterator returned status " << ex.what(); + abort(); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + + iter.reset(); +} + +TEST_P(DBMultiScanIteratorTest, AsyncPrefetchMultipleLevels) { + // Test async prefetch with files in L0 and non-L0 levels + // Similar setup to AsyncPrefetchAcrossMultipleFiles but with L0 files + auto options = CurrentOptions(); + options.target_file_size_base = 1 << 15; // 32KiB + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 50; + options.compression = kNoCompression; + options.statistics = CreateDBStatistics(); + DestroyAndReopen(options); + + Random rnd(304); + + // Create base files and compact to bottom level - ~500KiB of data + for (int i = 0; i < 500; ++i) { + std::stringstream ss; + ss << "k" << std::setw(5) << std::setfill('0') << i; + ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10))); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); + + // Verify we have files at bottom level + ASSERT_GT(NumTableFilesAtLevel(49), 0); + + // Create additional L0 files with overlapping key ranges + for (int i = 100; i < 150; ++i) { + std::stringstream ss; + ss << "k" << std::setw(5) << std::setfill('0') << i; + ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10))); + } + ASSERT_OK(Flush()); + + // Verify we now have files in both L0 and bottom level + ASSERT_GT(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(49), 0); + + // Set up multiple non-overlapping ranges + std::vector key_ranges( + {"k00000", "k00100", "k00200", "k00300", "k00400", "k00500"}); + ReadOptions ro; + ro.fill_cache = GetParam(); + + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.use_async_io = true; + scan_options.insert(key_ranges[0], key_ranges[1]); + scan_options.insert(key_ranges[2], key_ranges[3]); + scan_options.insert(key_ranges[4], key_ranges[5]); + + std::unique_ptr iter = + dbfull()->NewMultiScan(ro, cfh, scan_options); + ASSERT_NE(iter, nullptr); + + // Verify all three ranges can be scanned successfully + int total_keys = 0; + try { + for (auto range : *iter) { + for (auto it : range) { + it.first.ToString(); + total_keys++; + } + } + } catch (MultiScanException& ex) { + ASSERT_NOK(ex.status()); + std::cerr << "Iterator returned status " << ex.what(); + abort(); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + + // Should have keys from all three ranges + ASSERT_GT(total_keys, 0); + iter.reset(); +} + +TEST_P(DBMultiScanIteratorTest, AsyncPrefetchWithDeleteRange) { + // Test async prefetch with delete ranges + auto options = CurrentOptions(); + options.target_file_size_base = 1 << 15; // 32KiB + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 50; + options.compression = kNoCompression; + DestroyAndReopen(options); + + Random rnd(305); + + // Create base data - ~500KiB + for (int i = 0; i < 500; ++i) { + std::stringstream ss; + ss << "k" << std::setw(5) << std::setfill('0') << i; + ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10))); + } + ASSERT_OK(Flush()); + + // Add delete ranges + ASSERT_OK(db_->DeleteRange(WriteOptions(), dbfull()->DefaultColumnFamily(), + "k00100", "k00200")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); + ASSERT_GT(NumTableFilesAtLevel(49), 0); + + // Set up scan ranges that interact with delete ranges + std::vector key_ranges({"k00000", "k00500"}); + ReadOptions ro; + ro.fill_cache = GetParam(); + + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.use_async_io = true; + scan_options.insert(key_ranges[0], key_ranges[1]); + + std::unique_ptr iter = + dbfull()->NewMultiScan(ro, cfh, scan_options); + ASSERT_NE(iter, nullptr); + + // Verify ranges can be scanned successfully + int total_keys = 0; + try { + for (auto range : *iter) { + for (auto it : range) { + std::string key = it.first.ToString(); + // Verify deleted keys are not returned + ASSERT_TRUE((key < "k00100" || key >= "k00200")); + total_keys++; + } + } + } catch (MultiScanException& ex) { + ASSERT_NOK(ex.status()); + std::cerr << "Iterator returned status " << ex.what(); + abort(); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + + // Should have keys excluding deleted ranges + ASSERT_EQ(total_keys, 400); + iter.reset(); +} + +TEST_P(DBMultiScanIteratorTest, AsyncPrefetchWithExternalFileIngestion) { + // Test async prefetch with externally ingested files + auto options = CurrentOptions(); + options.target_file_size_base = 1 << 15; // 32KiB + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 50; + options.compression = kNoCompression; + DestroyAndReopen(options); + + Random rnd(306); + + // Create base data - ~200KiB + for (int i = 0; i < 200; ++i) { + std::stringstream ss; + ss << "k" << std::setw(5) << std::setfill('0') << i; + ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10))); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); + + // Create and ingest external SST file with new data + std::string ingest_file = dbname_ + "/test_ingest.sst"; + { + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options)); + ASSERT_OK(writer->Open(ingest_file)); + for (int i = 300; i < 500; ++i) { + std::stringstream ss; + ss << "k" << std::setw(5) << std::setfill('0') << i; + ASSERT_OK(writer->Put(ss.str(), rnd.RandomString(1 << 10))); + } + ASSERT_OK(writer->Finish()); + } + + IngestExternalFileOptions ifo; + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + ASSERT_OK(dbfull()->IngestExternalFile(cfh, {ingest_file}, ifo)); + + // Set up scan ranges that span both regular and ingested files + std::vector key_ranges({"k00000", "k00500"}); + ReadOptions ro; + ro.fill_cache = GetParam(); + + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.use_async_io = true; + scan_options.insert(key_ranges[0], key_ranges[1]); + + std::unique_ptr iter = + dbfull()->NewMultiScan(ro, cfh, scan_options); + ASSERT_NE(iter, nullptr); + + // Verify all ranges can be scanned successfully + int total_keys = 0; + try { + for (auto range : *iter) { + for (auto it : range) { + it.first.ToString(); + total_keys++; + } + } + } catch (MultiScanException& ex) { + ASSERT_NOK(ex.status()); + std::cerr << "Iterator returned status " << ex.what(); + abort(); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + + ASSERT_EQ(total_keys, 400); + iter.reset(); +} + +TEST_P(DBMultiScanIteratorTest, IODispatcherStatsVerification) { + // Test that verifies all IOs go through the IODispatcher by checking stats + auto options = CurrentOptions(); + options.target_file_size_base = 1 << 15; // 32KiB + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 50; + options.compression = kNoCompression; + DestroyAndReopen(options); + + Random rnd(307); + + // Create data - enough to create multiple data blocks + for (int i = 0; i < 500; ++i) { + std::stringstream ss; + ss << "k" << std::setw(5) << std::setfill('0') << i; + ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10))); // 1KiB values + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); + + // Set up scan ranges + std::vector key_ranges({"k00000", "k00200", "k00300", "k00400"}); + ReadOptions ro; + ro.fill_cache = GetParam(); + + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + + // Create a tracking IODispatcher to verify IO statistics + auto tracking_dispatcher = std::make_shared(); + + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.use_async_io = false; // Use sync IO for predictable stats + scan_options.io_dispatcher = tracking_dispatcher; + scan_options.insert(key_ranges[0], key_ranges[1]); + scan_options.insert(key_ranges[2], key_ranges[3]); + + std::unique_ptr iter = + dbfull()->NewMultiScan(ro, cfh, scan_options); + ASSERT_NE(iter, nullptr); + + // Scan through all data + int total_keys = 0; + try { + for (auto range : *iter) { + for (auto it : range) { + it.first.ToString(); + total_keys++; + } + } + } catch (MultiScanException& ex) { + ASSERT_NOK(ex.status()); + std::cerr << "Iterator returned status " << ex.what(); + abort(); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + + // We scanned ~200 keys in range 1 and ~100 keys in range 2 + ASSERT_EQ(total_keys, 300); + + // Verify that IO operations went through the IODispatcher + // The total IO operations should be > 0 (either sync reads, async reads, or + // cache hits) + uint64_t total_ops = tracking_dispatcher->GetTotalIOOperations(); + ASSERT_GT(total_ops, 0) << "Expected some IO operations through IODispatcher"; + + // Verify that we have at least one ReadSet created + ASSERT_GT(tracking_dispatcher->GetReadSets().size(), 0) + << "Expected at least one ReadSet to be created"; + + // Since we used sync IO, we should have sync reads (or cache hits if cached) + uint64_t sync_reads = tracking_dispatcher->GetTotalSyncReads(); + uint64_t cache_hits = tracking_dispatcher->GetTotalCacheHits(); + ASSERT_GT(sync_reads + cache_hits, 0) + << "Expected sync reads or cache hits for sync IO mode"; + + iter.reset(); +} + +TEST_P(DBMultiScanIteratorTest, IODispatcherPrefetchKnownBlocks) { + // Test that verifies we prefetch a known/expected number of blocks. + // Uses FlushBlockEveryKeyPolicyFactory to create exactly one block per key, + // making the block count predictable and verifiable. + auto options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + + // Configure to create exactly one block per key + BlockBasedTableOptions table_options; + table_options.flush_block_policy_factory = + std::make_shared(); + // Use a block cache (required by IODispatcher), but use a fresh one + // that won't have any cached data + table_options.block_cache = NewLRUCache(10 * 1024 * 1024); // 10MB cache + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + // Create exactly 100 keys, each in its own block + const int kNumKeys = 100; + const int kValueSize = 100; // Fixed value size for predictability + std::string value(kValueSize, 'v'); + + for (int i = 0; i < kNumKeys; ++i) { + std::stringstream ss; + ss << "k" << std::setw(3) << std::setfill('0') << i; + ASSERT_OK(Put(ss.str(), value)); + } + ASSERT_OK(Flush()); + + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + + // Create a tracking IODispatcher to verify IO statistics + auto tracking_dispatcher = std::make_shared(); + + // Define scan ranges with known block counts: + // Range 1: k000 to k020 (20 keys = 20 blocks) + // Range 2: k050 to k060 (10 keys = 10 blocks) + // Total expected blocks to read: 30 + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.use_async_io = false; // Use sync IO for predictable stats + scan_options.io_dispatcher = tracking_dispatcher; + scan_options.insert("k000", "k020"); + scan_options.insert("k050", "k060"); + + ReadOptions ro; + ro.fill_cache = false; // Don't fill cache, ensure fresh reads + + std::unique_ptr iter = + dbfull()->NewMultiScan(ro, cfh, scan_options); + ASSERT_NE(iter, nullptr); + + // Scan through all data and count keys + int total_keys = 0; + try { + for (auto range : *iter) { + for (auto it : range) { + it.first.ToString(); + total_keys++; + } + } + } catch (MultiScanException& ex) { + ASSERT_NOK(ex.status()); + std::cerr << "Iterator returned status " << ex.what(); + abort(); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + + // Verify we scanned the expected number of keys + // Range 1: k000-k019 = 20 keys, Range 2: k050-k059 = 10 keys + ASSERT_EQ(total_keys, 30) << "Expected 30 keys from two ranges"; + + // Verify IODispatcher statistics + uint64_t total_ops = tracking_dispatcher->GetTotalIOOperations(); + uint64_t sync_reads = tracking_dispatcher->GetTotalSyncReads(); + + // We should have at least as many IO operations as blocks we need to read + // (could be more due to index/filter blocks) + ASSERT_GE(total_ops, 30) + << "Expected at least 30 IO operations for 30 data blocks"; + + // Since cache is fresh and fill_cache=false, all should be sync reads + ASSERT_GE(sync_reads, 30) + << "Expected at least 30 sync reads for 30 data blocks"; + + // Verify we created ReadSets (one per range) + size_t num_readsets = tracking_dispatcher->GetReadSets().size(); + ASSERT_GE(num_readsets, 1) << "Expected at least one ReadSet"; + + // Log the stats for debugging + std::cout << "IODispatcher Stats: total_ops=" << total_ops + << ", sync_reads=" << sync_reads + << ", async_reads=" << tracking_dispatcher->GetTotalAsyncReads() + << ", cache_hits=" << tracking_dispatcher->GetTotalCacheHits() + << ", readsets=" << num_readsets << std::endl; + + iter.reset(); +} + +TEST_P(DBMultiScanIteratorTest, IODispatcherCacheHitVerification) { + // Test that verifies cache hits are properly tracked through IODispatcher. + // First scan populates cache, second scan should show cache hits. + auto options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + + BlockBasedTableOptions table_options; + table_options.flush_block_policy_factory = + std::make_shared(); + // Enable block cache with enough space for all blocks + table_options.block_cache = NewLRUCache(10 * 1024 * 1024); // 10MB cache + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + // Create 50 keys, each in its own block + const int kNumKeys = 50; + std::string value(100, 'v'); + + for (int i = 0; i < kNumKeys; ++i) { + std::stringstream ss; + ss << "k" << std::setw(3) << std::setfill('0') << i; + ASSERT_OK(Put(ss.str(), value)); + } + ASSERT_OK(Flush()); + + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + + // First scan: populate the cache + { + auto dispatcher1 = std::make_shared(); + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.use_async_io = false; + scan_options.io_dispatcher = dispatcher1; + scan_options.insert("k000", "k025"); // 25 keys + + ReadOptions ro; + ro.fill_cache = true; // Fill cache on first scan + + std::unique_ptr iter = + dbfull()->NewMultiScan(ro, cfh, scan_options); + ASSERT_NE(iter, nullptr); + + int count = 0; + try { + for (auto range : *iter) { + for (auto it : range) { + it.first.ToString(); + count++; + } + } + } catch (MultiScanException& ex) { + FAIL() << "First scan failed: " << ex.what(); + } + ASSERT_EQ(count, 25); + + // First scan should have sync reads (cache was empty) + uint64_t first_sync = dispatcher1->GetTotalSyncReads(); + ASSERT_GE(first_sync, 25) << "First scan should have sync reads"; + + std::cout << "First scan stats: sync_reads=" << first_sync + << ", cache_hits=" << dispatcher1->GetTotalCacheHits() + << std::endl; + } + + // Second scan: should get cache hits + { + auto dispatcher2 = std::make_shared(); + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.use_async_io = false; + scan_options.io_dispatcher = dispatcher2; + scan_options.insert("k000", "k025"); // Same range as before + + ReadOptions ro; + ro.fill_cache = true; + + std::unique_ptr iter = + dbfull()->NewMultiScan(ro, cfh, scan_options); + ASSERT_NE(iter, nullptr); + + int count = 0; + try { + for (auto range : *iter) { + for (auto it : range) { + it.first.ToString(); + count++; + } + } + } catch (MultiScanException& ex) { + FAIL() << "Second scan failed: " << ex.what(); + } + ASSERT_EQ(count, 25); + + // Second scan should have cache hits (blocks were cached in first scan) + uint64_t second_cache_hits = dispatcher2->GetTotalCacheHits(); + uint64_t second_sync = dispatcher2->GetTotalSyncReads(); + + std::cout << "Second scan stats: sync_reads=" << second_sync + << ", cache_hits=" << second_cache_hits << std::endl; + + // We expect cache hits on the second scan for data blocks + // Note: Some blocks might still need sync reads (e.g., if cache was + // evicted) + ASSERT_GE(second_cache_hits, 20) + << "Second scan should have cache hits for most blocks"; + } +} + +TEST_P(DBMultiScanIteratorTest, WastedBlocksTracking) { + // Test that verifies wasted prefetch blocks are properly tracked. + // When blocks are prefetched but skipped (e.g., due to seek), they should + // be counted as wasted and recorded to MULTISCAN_PREFETCH_BLOCKS_WASTED. + auto options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + + BlockBasedTableOptions table_options; + table_options.flush_block_policy_factory = + std::make_shared(); + table_options.block_cache = NewLRUCache(10 * 1024 * 1024); // 10MB cache + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + // Create 100 keys, each in its own block + const int kNumKeys = 100; + std::string value(100, 'v'); + + for (int i = 0; i < kNumKeys; ++i) { + std::stringstream ss; + ss << "k" << std::setw(3) << std::setfill('0') << i; + ASSERT_OK(Put(ss.str(), value)); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); + + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + + // Reset the wasted blocks counter before test + options.statistics->setTickerCount(MULTISCAN_PREFETCH_BLOCKS_WASTED, 0); + + // Set up MultiScan with two non-contiguous ranges: + // Range 1: k000-k020 (20 keys/blocks) + // Range 2: k050-k070 (20 keys/blocks) + // The blocks between k020-k050 (30 blocks) should be wasted if prefetched + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.use_async_io = false; + scan_options.insert("k000", "k020"); + scan_options.insert("k050", "k070"); + + ReadOptions ro; + ro.fill_cache = GetParam(); + + { + std::unique_ptr iter = + dbfull()->NewMultiScan(ro, cfh, scan_options); + ASSERT_NE(iter, nullptr); + + int count = 0; + try { + for (auto range : *iter) { + for (auto it : range) { + it.first.ToString(); + count++; + } + } + } catch (MultiScanException& ex) { + FAIL() << "Scan failed: " << ex.what(); + } + + // We should have scanned 40 keys total (20 + 20) + ASSERT_EQ(count, 40); + } // Iterator destroyed here, wasted blocks recorded + + // Check that wasted blocks were recorded + // The exact count depends on how many blocks were prefetched between ranges + uint64_t wasted = + options.statistics->getTickerCount(MULTISCAN_PREFETCH_BLOCKS_WASTED); + + // We expect some wasted blocks due to the gap between ranges + // The exact number depends on prefetch behavior, but should be > 0 + // if blocks between k020-k050 were prefetched + std::cout << "Wasted blocks: " << wasted << std::endl; + + // Note: The test verifies the tracking mechanism works. + // The actual count depends on prefetch heuristics which may vary. +} } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_kv_checksum_test.cc b/db/db_kv_checksum_test.cc index 6eea6e5b4ba0..7d18688f0788 100644 --- a/db/db_kv_checksum_test.cc +++ b/db/db_kv_checksum_test.cc @@ -312,12 +312,12 @@ TEST_P(DbKvChecksumTest, WriteToWALCorrupted) { // Corrupted write batch leads to read-only mode, so we have to // reopen for every attempt. Reopen(options); - auto log_size_pre_write = dbfull()->TEST_total_log_size(); + auto log_size_pre_write = dbfull()->TEST_wals_total_size(); SyncPoint::GetInstance()->EnableProcessing(); ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption()); // Confirm that nothing was written to WAL - ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size()); + ASSERT_EQ(log_size_pre_write, dbfull()->TEST_wals_total_size()); ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption()); SyncPoint::GetInstance()->DisableProcessing(); @@ -350,12 +350,12 @@ TEST_P(DbKvChecksumTest, WriteToWALWithColumnFamilyCorrupted) { // Corrupted write batch leads to read-only mode, so we have to // reopen for every attempt. ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); - auto log_size_pre_write = dbfull()->TEST_total_log_size(); + auto log_size_pre_write = dbfull()->TEST_wals_total_size(); SyncPoint::GetInstance()->EnableProcessing(); ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption()); // Confirm that nothing was written to WAL - ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size()); + ASSERT_EQ(log_size_pre_write, dbfull()->TEST_wals_total_size()); ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption()); SyncPoint::GetInstance()->DisableProcessing(); @@ -487,7 +487,7 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) { // Reopen DB since it failed WAL write which lead to read-only mode Reopen(options); SyncPoint::GetInstance()->EnableProcessing(); - auto log_size_pre_write = dbfull()->TEST_total_log_size(); + auto log_size_pre_write = dbfull()->TEST_wals_total_size(); leader_batch_and_status = GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_), 8 /* protection_bytes_per_key */, op_type1_); @@ -499,7 +499,7 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) { SyncPoint::GetInstance()->ClearCallBack("WriteThread::JoinBatchGroup:Wait"); ASSERT_EQ(1, leader_count); // Nothing should have been written to WAL - ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size()); + ASSERT_EQ(log_size_pre_write, dbfull()->TEST_wals_total_size()); ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption()); corrupt_byte_offset++; @@ -599,7 +599,7 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) { // Reopen DB since it failed WAL write which lead to read-only mode ReopenWithColumnFamilies({kDefaultColumnFamilyName, "ramen"}, options); SyncPoint::GetInstance()->EnableProcessing(); - auto log_size_pre_write = dbfull()->TEST_total_log_size(); + auto log_size_pre_write = dbfull()->TEST_wals_total_size(); leader_batch_and_status = GetWriteBatch(GetCFHandleToUse(handles_[1], op_type1_), 8 /* protection_bytes_per_key */, op_type1_); @@ -612,7 +612,7 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) { ASSERT_EQ(1, leader_count); // Nothing should have been written to WAL - ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size()); + ASSERT_EQ(log_size_pre_write, dbfull()->TEST_wals_total_size()); ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption()); corrupt_byte_offset++; diff --git a/db/db_log_iter_test.cc b/db/db_log_iter_test.cc index 17163210e82f..62b1f893d5c2 100644 --- a/db/db_log_iter_test.cc +++ b/db/db_log_iter_test.cc @@ -180,13 +180,15 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckWhenArchive) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(dbfull()->Flush(FlushOptions(), cf)); + // Try lots of things to ensure callback is triggered + ASSERT_OK(dbfull()->TEST_SwitchWAL()); + ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); + ASSERT_OK(dbfull()->TEST_WaitForPurge()); delete cf; - // Normally hit several times; WART: perhaps more in parallel after flush - // FIXME: this test is flaky - // ASSERT_TRUE(callback_hit.LoadRelaxed()); + ASSERT_TRUE(callback_hit.LoadRelaxed()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + Close(); } while (ChangeCompactOptions()); - Close(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } #endif diff --git a/db/db_logical_block_size_cache_test.cc b/db/db_logical_block_size_cache_test.cc index ff56d56e370d..a2de4e33e417 100644 --- a/db/db_logical_block_size_cache_test.cc +++ b/db/db_logical_block_size_cache_test.cc @@ -67,7 +67,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenClose) { options.db_paths = {{data_path_0_, 2048}, {data_path_1_, 2048}}; for (int i = 0; i < 2; i++) { - DB* db; + std::unique_ptr db; if (!i) { printf("Open\n"); ASSERT_OK(DB::Open(options, dbname_, &db)); @@ -82,7 +82,6 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenClose) { ASSERT_EQ(1, cache_->GetRefCount(data_path_1_)); ASSERT_OK(db->Close()); ASSERT_EQ(0, cache_->Size()); - delete db; } ASSERT_OK(DestroyDB(dbname_, options, {})); } @@ -95,7 +94,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenDelete) { options.env = env_.get(); for (int i = 0; i < 2; i++) { - DB* db; + std::unique_ptr db; if (!i) { printf("Open\n"); ASSERT_OK(DB::Open(options, dbname_, &db)); @@ -106,7 +105,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenDelete) { ASSERT_EQ(1, cache_->Size()); ASSERT_TRUE(cache_->Contains(dbname_)); ASSERT_EQ(1, cache_->GetRefCount(dbname_)); - delete db; + db.reset(); ASSERT_EQ(0, cache_->Size()); } ASSERT_OK(DestroyDB(dbname_, options, {})); @@ -122,7 +121,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamily) { ColumnFamilyOptions cf_options; cf_options.cf_paths = {{cf_path_0_, 1024}, {cf_path_1_, 2048}}; - DB* db; + std::unique_ptr db; ASSERT_OK(DB::Open(options, dbname_, &db)); ASSERT_EQ(1, cache_->Size()); ASSERT_TRUE(cache_->Contains(dbname_)); @@ -153,7 +152,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamily) { ASSERT_TRUE(cache_->Contains(dbname_)); ASSERT_EQ(1, cache_->GetRefCount(dbname_)); - delete db; + db.reset(); ASSERT_EQ(0, cache_->Size()); ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}})); } @@ -173,7 +172,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) { ColumnFamilyOptions cf_options; cf_options.cf_paths = {{cf_path_0_, 1024}}; - DB* db; + std::unique_ptr db; ASSERT_OK(DB::Open(options, dbname_, &db)); ASSERT_EQ(1, cache_->Size()); ASSERT_TRUE(cache_->Contains(dbname_)); @@ -211,7 +210,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) { ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1])); ASSERT_TRUE(cache_->Contains(dbname_)); ASSERT_EQ(1, cache_->GetRefCount(dbname_)); - delete db; + db.reset(); // Now cf_path_0_ in cache_ has been properly decreased and cf_path_0_'s entry // is dropped from cache @@ -233,15 +232,15 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenWithColumnFamilies) { cf_options.cf_paths = {{cf_path_0_, 1024}}; for (int i = 0; i < 2; i++) { - DB* db; + std::unique_ptr db; + ASSERT_OK(DB::Open(options, dbname_, &db)); ColumnFamilyHandle* cf1 = nullptr; ColumnFamilyHandle* cf2 = nullptr; - ASSERT_OK(DB::Open(options, dbname_, &db)); ASSERT_OK(db->CreateColumnFamily(cf_options, "cf1", &cf1)); ASSERT_OK(db->CreateColumnFamily(cf_options, "cf2", &cf2)); ASSERT_OK(db->DestroyColumnFamilyHandle(cf1)); ASSERT_OK(db->DestroyColumnFamilyHandle(cf2)); - delete db; + db.reset(); ASSERT_EQ(0, cache_->Size()); std::vector cfs; @@ -298,7 +297,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenWithColumnFamilies) { ASSERT_TRUE(cache_->Contains(dbname_)); ASSERT_EQ(1, cache_->GetRefCount(dbname_)); - delete db; + db.reset(); ASSERT_EQ(0, cache_->Size()); } ASSERT_OK( @@ -315,7 +314,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) { ColumnFamilyOptions cf_options; cf_options.cf_paths = {{cf_path_0_, 1024}}; - DB* db; + std::unique_ptr db; ASSERT_OK(DB::Open(options, dbname_, &db)); ASSERT_EQ(1, cache_->Size()); ASSERT_TRUE(cache_->Contains(dbname_)); @@ -336,7 +335,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) { ASSERT_TRUE(cache_->Contains(cf_path_0_)); ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); - delete db; + db.reset(); ASSERT_EQ(0, cache_->Size()); // Open with column families. @@ -369,7 +368,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) { ASSERT_TRUE(cache_->Contains(cf_path_0_)); ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); - delete db; + db.reset(); ASSERT_EQ(0, cache_->Size()); } ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}})); @@ -384,7 +383,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) { ASSERT_OK(env_->CreateDirIfMissing(dbname_)); - DB* db0; + std::unique_ptr db0; ASSERT_OK(DB::Open(options, data_path_0_, &db0)); ASSERT_EQ(1, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); @@ -399,7 +398,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) { ASSERT_TRUE(cache_->Contains(cf_path_0_)); ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); - DB* db1; + std::unique_ptr db1; ASSERT_OK(DB::Open(options, data_path_1_, &db1)); ASSERT_EQ(3, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); @@ -424,7 +423,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) { ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_)); ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0)); - delete db0; + db0.reset(); ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_1_)); ASSERT_EQ(1, cache_->GetRefCount(data_path_1_)); @@ -433,7 +432,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) { ASSERT_OK(DestroyDB(data_path_0_, options, {{"cf", cf_options0}})); ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1)); - delete db1; + db1.reset(); ASSERT_EQ(0, cache_->Size()); ASSERT_OK(DestroyDB(data_path_1_, options, {{"cf", cf_options1}})); } @@ -450,7 +449,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) { ASSERT_OK(env_->CreateDirIfMissing(dbname_)); - DB* db0; + std::unique_ptr db0; ASSERT_OK(DB::Open(options, dbname_ + "/db0", &db0)); ASSERT_EQ(1, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); @@ -464,7 +463,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) { ASSERT_TRUE(cache_->Contains(cf_path_0_)); ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); - DB* db1; + std::unique_ptr db1; ASSERT_OK(DB::Open(options, dbname_ + "/db1", &db1)); ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); @@ -481,7 +480,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) { ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0)); - delete db0; + db0.reset(); ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); @@ -490,7 +489,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) { ASSERT_OK(DestroyDB(dbname_ + "/db0", options, {{"cf", cf_options}})); ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1)); - delete db1; + db1.reset(); ASSERT_EQ(0, cache_->Size()); ASSERT_OK(DestroyDB(dbname_ + "/db1", options, {{"cf", cf_options}})); } diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc index 3f7b029572e4..1086401dd3f9 100644 --- a/db/db_memtable_test.cc +++ b/db/db_memtable_test.cc @@ -117,8 +117,6 @@ class TestPrefixExtractor : public SliceTransform { return separator(key) != nullptr; } - bool InRange(const Slice& /*key*/) const override { return false; } - private: const char* separator(const Slice& key) const { return static_cast(memchr(key.data(), '_', key.size())); @@ -339,6 +337,135 @@ TEST_F(DBMemTableTest, ColumnFamilyId) { } } +class DBMemTableTestForSeek : public DBMemTableTest, + virtual public ::testing::WithParamInterface< + std::tuple> {}; + +TEST_P(DBMemTableTestForSeek, IntegrityChecks) { + // Validate key corruption could be detected during seek. + // We insert many keys into skiplist. Then we corrupt the each key one at a + // time. With memtable_veirfy_per_key_checksum_on_seek enabled, when the + // corrupted key is searched, the checksum of every key visited during the + // seek is validated. It will report data corruption. Otherwise seek returns + // not found. + auto allow_data_in_error = std::get<0>(GetParam()); + Options options = CurrentOptions(); + options.allow_data_in_errors = allow_data_in_error; + options.paranoid_memory_checks = std::get<1>(GetParam()); + options.memtable_veirfy_per_key_checksum_on_seek = std::get<2>(GetParam()); + options.memtable_protection_bytes_per_key = 8; + DestroyAndReopen(options); + + // capture the data pointer of all of the keys + std::vector raw_data_pointer; + + // Insert enough keys, so memtable would create multiple levels. + auto key_count = 100; + for (int i = 0; i < key_count; i++) { + // The last digit of the key will be corrupted from value 0 to value 5 + ASSERT_OK(Put(Key(i * 10), "val0")); + } + + ReadOptions rops; + + // Iterate all the keys to get key pointers + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->SetCallBack("InlineSkipList::Iterator::Next::key", + [&raw_data_pointer](void* key) { + auto p = static_cast(key); + raw_data_pointer.push_back(p); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + { + std::unique_ptr iter{db_->NewIterator(rops)}; + iter->Seek(Key(0)); + while (iter->Valid()) { + ASSERT_OK(iter->status()); + iter->Next(); + } + // check status after valid returned false. + auto status = iter->status(); + ASSERT_TRUE(status.ok()); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(raw_data_pointer.size(), key_count); + + bool enable_key_validation_on_seek = + options.memtable_veirfy_per_key_checksum_on_seek; + + // For each key, corrupt it, validate corruption is detected correctly, then + // revert it. + for (int i = 0; i < key_count; i++) { + std::string key_to_corrupt = Key(i * 10); + raw_data_pointer[i][key_to_corrupt.size()] = '5'; + + auto corrupted_key = key_to_corrupt; + corrupted_key.data()[key_to_corrupt.size() - 1] = '5'; + auto corrupted_key_slice = + Slice(corrupted_key.data(), corrupted_key.length()); + auto corrupted_key_hex = corrupted_key_slice.ToString(/*hex=*/true); + + { + // Test Get API + std::string val; + auto status = db_->Get(rops, key_to_corrupt, &val); + if (enable_key_validation_on_seek) { + ASSERT_TRUE(status.IsCorruption()) << key_to_corrupt; + ASSERT_EQ( + status.ToString().find(corrupted_key_hex) != std::string::npos, + allow_data_in_error) + << status.ToString() << "\n" + << corrupted_key_hex; + } else { + ASSERT_TRUE(status.IsNotFound()); + } + } + + { + // Test MultiGet API + std::vector vals; + std::vector statuses = db_->MultiGet( + rops, {db_->DefaultColumnFamily()}, {key_to_corrupt}, &vals, nullptr); + if (enable_key_validation_on_seek) { + ASSERT_TRUE(statuses[0].IsCorruption()); + ASSERT_EQ( + statuses[0].ToString().find(corrupted_key_hex) != std::string::npos, + allow_data_in_error); + } else { + ASSERT_TRUE(statuses[0].IsNotFound()); + } + } + + { + // Test Iterator Seek API + std::unique_ptr iter{db_->NewIterator(rops)}; + ASSERT_OK(iter->status()); + iter->Seek(key_to_corrupt); + auto status = iter->status(); + if (enable_key_validation_on_seek) { + ASSERT_TRUE(status.IsCorruption()); + ASSERT_EQ( + status.ToString().find(corrupted_key_hex) != std::string::npos, + allow_data_in_error); + } else { + ASSERT_FALSE(iter->Valid()); + ASSERT_FALSE(status.ok()); + } + } + + // revert the key corruption. + raw_data_pointer[i][key_to_corrupt.size()] = '0'; + } +} + +INSTANTIATE_TEST_CASE_P(DBMemTableTestForSeek, DBMemTableTestForSeek, + ::testing::Combine(::testing::Bool(), ::testing::Bool(), + ::testing::Bool())); + TEST_F(DBMemTableTest, IntegrityChecks) { // We insert keys key000000, key000001 and key000002 into skiplist at fixed // height 1 (smallest height). Then we corrupt the second key to aey000001 to @@ -424,6 +551,96 @@ TEST_F(DBMemTableTest, IntegrityChecks) { ASSERT_FALSE(iter->Valid()); } } + +TEST_F(DBMemTableTest, VectorConcurrentInsert) { + Options options; + options.create_if_missing = true; + options.create_missing_column_families = true; + options.allow_concurrent_memtable_write = true; + options.memtable_factory.reset(new VectorRepFactory()); + DestroyAndReopen(options); + CreateAndReopenWithCF({"cf1"}, options); + + // Multi-threaded writes + { + WriteOptions write_options; + std::vector threads; + for (int i = 0; i < 10; ++i) { + threads.emplace_back([&, i]() { + int start = i * 100; + int end = start + 100; + WriteBatch batch; + for (int j = start; j < end; ++j) { + ASSERT_OK( + batch.Put(handles_[0], Key(j), "value" + std::to_string(j))); + } + ASSERT_OK(db_->Write(write_options, &batch)); + }); + } + for (auto& t : threads) { + t.join(); + } + + std::unique_ptr iter( + db_->NewIterator(ReadOptions(), handles_[0])); + iter->SeekToFirst(); + for (int i = 0; i < 1000; ++i) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), Key(i)); + ASSERT_EQ(iter->value().ToString(), "value" + std::to_string(i)); + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } + + // Multi-threaded writes, multi CF + { + WriteOptions write_options; + std::vector threads; + for (int i = 0; i < 10; ++i) { + threads.emplace_back([&, i]() { + int start = i * 100; + int end = start + 100; + WriteBatch batch; + for (int j = start; j < end; ++j) { + ASSERT_OK(batch.Put(handles_[0], Key(j), "CF0" + std::to_string(j))); + ASSERT_OK(batch.Put(handles_[1], Key(j), "CF1" + std::to_string(j))); + } + ASSERT_OK(db_->Write(write_options, &batch)); + }); + } + + for (auto& t : threads) { + t.join(); + } + + std::unique_ptr iter0( + db_->NewIterator(ReadOptions(), handles_[0])); + std::unique_ptr iter1( + db_->NewIterator(ReadOptions(), handles_[1])); + iter0->SeekToFirst(); + iter1->SeekToFirst(); + for (int i = 0; i < 1000; ++i) { + ASSERT_TRUE(iter0->Valid()); + ASSERT_EQ(iter0->key().ToString(), Key(i)); + ASSERT_EQ(iter0->value().ToString(), "CF0" + std::to_string(i)); + iter0->Next(); + + ASSERT_TRUE(iter1->Valid()); + ASSERT_EQ(iter1->key().ToString(), Key(i)); + ASSERT_EQ(iter1->value().ToString(), "CF1" + std::to_string(i)); + iter1->Next(); + } + ASSERT_FALSE(iter0->Valid()); + ASSERT_OK(iter0->status()); + ASSERT_FALSE(iter1->Valid()); + ASSERT_OK(iter1->status()); + } + + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); +} } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_merge_operator_test.cc b/db/db_merge_operator_test.cc index 69f6ec4e9185..143203fd7b7e 100644 --- a/db/db_merge_operator_test.cc +++ b/db/db_merge_operator_test.cc @@ -386,7 +386,7 @@ TEST_F(DBMergeOperatorTest, MergeOperandThresholdExceeded) { snapshots.reserve(3); for (size_t i = 0; i < keys.size(); ++i) { - snapshots.emplace_back(db_); + snapshots.emplace_back(db_.get()); const std::string suffix = std::to_string(i + 1); @@ -971,7 +971,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) { // No base value { - constexpr char key[] = "key1"; + const std::string key = "key1"; ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, foo)); ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, bar)); @@ -985,7 +985,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) { // max_successive_merges. constexpr size_t max_key_versions = 8; std::vector key_versions; - ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), key, key, + ASSERT_OK(GetAllKeyVersions(db_.get(), db_->DefaultColumnFamily(), key, key, max_key_versions, &key_versions)); ASSERT_EQ(key_versions.size(), 2); ASSERT_EQ(key_versions[0].type, kTypeValue); @@ -994,7 +994,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) { // Plain base value { - constexpr char key[] = "key2"; + const std::string key = "key2"; ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), key, foo)); ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, bar)); @@ -1009,7 +1009,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) { // max_successive_merges. constexpr size_t max_key_versions = 8; std::vector key_versions; - ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), key, key, + ASSERT_OK(GetAllKeyVersions(db_.get(), db_->DefaultColumnFamily(), key, key, max_key_versions, &key_versions)); ASSERT_EQ(key_versions.size(), 3); ASSERT_EQ(key_versions[0].type, kTypeValue); @@ -1019,7 +1019,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) { // Wide-column base value { - constexpr char key[] = "key3"; + const std::string key = "key3"; const WideColumns columns{{kDefaultWideColumnName, foo}, {bar, baz}}; ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), key, @@ -1038,7 +1038,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) { // max_successive_merges. constexpr size_t max_key_versions = 8; std::vector key_versions; - ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), key, key, + ASSERT_OK(GetAllKeyVersions(db_.get(), db_->DefaultColumnFamily(), key, key, max_key_versions, &key_versions)); ASSERT_EQ(key_versions.size(), 3); ASSERT_EQ(key_versions[0].type, kTypeWideColumnEntity); diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 99d390db2399..07e5d27f23e8 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -70,7 +70,8 @@ class DBOptionsTest : public DBTestBase { options.env = env_; ImmutableDBOptions db_options(options); test::RandomInitCFOptions(&options, options, rnd); - auto sanitized_options = SanitizeOptions(db_options, options); + auto sanitized_options = + SanitizeCfOptions(db_options, /*read_only*/ false, options); auto opt_map = GetMutableCFOptionsMap(sanitized_options); delete options.compaction_filter; return opt_map; @@ -321,31 +322,26 @@ TEST_F(DBOptionsTest, SetWithCustomMemTableFactory) { } Options options; options.create_if_missing = true; - // Try with fail_if_options_file_error=false/true to update the options - for (bool on_error : {false, true}) { - options.fail_if_options_file_error = on_error; - options.env = env_; - options.disable_auto_compactions = false; + options.env = env_; + options.disable_auto_compactions = false; - options.memtable_factory.reset(new DummySkipListFactory()); - Reopen(options); + options.memtable_factory.reset(new DummySkipListFactory()); + Reopen(options); - ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); - ASSERT_OK( - dbfull()->SetOptions(cfh, {{"disable_auto_compactions", "true"}})); - ColumnFamilyDescriptor cfd; - ASSERT_OK(cfh->GetDescriptor(&cfd)); - ASSERT_STREQ(cfd.options.memtable_factory->Name(), - DummySkipListFactory::kClassName()); - ColumnFamilyHandle* test = nullptr; - ASSERT_OK(dbfull()->CreateColumnFamily(options, "test", &test)); - ASSERT_OK(test->GetDescriptor(&cfd)); - ASSERT_STREQ(cfd.options.memtable_factory->Name(), - DummySkipListFactory::kClassName()); - - ASSERT_OK(dbfull()->DropColumnFamily(test)); - delete test; - } + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + ASSERT_OK(dbfull()->SetOptions(cfh, {{"disable_auto_compactions", "true"}})); + ColumnFamilyDescriptor cfd; + ASSERT_OK(cfh->GetDescriptor(&cfd)); + ASSERT_STREQ(cfd.options.memtable_factory->Name(), + DummySkipListFactory::kClassName()); + ColumnFamilyHandle* test = nullptr; + ASSERT_OK(dbfull()->CreateColumnFamily(options, "test", &test)); + ASSERT_OK(test->GetDescriptor(&cfd)); + ASSERT_STREQ(cfd.options.memtable_factory->Name(), + DummySkipListFactory::kClassName()); + + ASSERT_OK(dbfull()->DropColumnFamily(test)); + delete test; } TEST_F(DBOptionsTest, SetBytesPerSync) { @@ -436,12 +432,47 @@ TEST_F(DBOptionsTest, SetWalBytesPerSync) { ASSERT_GT(low_bytes_per_sync, counter); } +TEST_F(DBOptionsTest, MutableManifestOptions) { + // These aren't end-to-end tests, but sufficient to ensure the VersionSet + // receives the updates with SetDBOptions + for (int64_t i : {0, 1, 100, 100000, 10000000}) { + ASSERT_OK( + db_->SetDBOptions({{"max_manifest_file_size", std::to_string(i)}})); + ASSERT_EQ(i, + static_cast(db_->GetDBOptions().max_manifest_file_size)); + ASSERT_EQ(i, + static_cast( + dbfull()->GetVersionSet()->TEST_GetMinMaxManifestFileSize())); + if (i > 1) { + ++i; + } + ASSERT_OK( + db_->SetDBOptions({{"max_manifest_space_amp_pct", std::to_string(i)}})); + ASSERT_EQ(i, static_cast( + db_->GetDBOptions().max_manifest_space_amp_pct)); + ASSERT_EQ(i, + static_cast( + dbfull()->GetVersionSet()->TEST_GetMaxManifestSpaceAmpPct())); + if (i > 1) { + ++i; + } + ASSERT_OK(db_->SetDBOptions( + {{"manifest_preallocation_size", std::to_string(i)}})); + ASSERT_EQ(i, static_cast( + db_->GetDBOptions().manifest_preallocation_size)); + ASSERT_EQ( + i, static_cast( + dbfull()->GetVersionSet()->TEST_GetManifestPreallocationSize())); + } +} + TEST_F(DBOptionsTest, WritableFileMaxBufferSize) { Options options; options.create_if_missing = true; options.writable_file_max_buffer_size = 1024 * 1024; options.level0_file_num_compaction_trigger = 3; options.max_manifest_file_size = 1; + options.max_manifest_space_amp_pct = 0; options.env = env_; int buffer_size = 1024 * 1024; Reopen(options); @@ -1658,6 +1689,46 @@ TEST_F(DBOptionsTest, SetOptionsNoManifestWrite) { ASSERT_EQ(Get("x"), "x"); } +TEST_F(DBOptionsTest, SetOptionsMultipleColumnFamilies) { + Options options; + options.create_if_missing = true; + options.env = CurrentOptions().env; + options.disable_auto_compactions = true; + Reopen(options); + + // Create two additional column families + CreateColumnFamilies({"cf1", "cf2"}, options); + ReopenWithColumnFamilies({"default", "cf1", "cf2"}, options); + + // Verify initial state - auto compaction should be disabled + ASSERT_TRUE(dbfull()->GetOptions(handles_[0]).disable_auto_compactions); + ASSERT_TRUE(dbfull()->GetOptions(handles_[1]).disable_auto_compactions); + ASSERT_TRUE(dbfull()->GetOptions(handles_[2]).disable_auto_compactions); + + // Set options on multiple column families at once + ASSERT_OK(dbfull()->SetOptions({handles_[1], handles_[2]}, + {{"disable_auto_compactions", "false"}})); + + ASSERT_TRUE( + dbfull()->GetOptions(handles_[0]).disable_auto_compactions); // unchanged + ASSERT_FALSE( + dbfull()->GetOptions(handles_[1]).disable_auto_compactions); // changed + ASSERT_FALSE( + dbfull()->GetOptions(handles_[2]).disable_auto_compactions); // changed + + std::unordered_map> + options_map; + options_map[handles_[0]] = {{"disable_auto_compactions", "false"}}; + options_map[handles_[1]] = {{"disable_auto_compactions", "true"}}; + options_map[handles_[2]] = {{"disable_auto_compactions", "true"}}; + ASSERT_OK(dbfull()->SetOptions(options_map)); + + ASSERT_FALSE(dbfull()->GetOptions(handles_[0]).disable_auto_compactions); + ASSERT_TRUE(dbfull()->GetOptions(handles_[1]).disable_auto_compactions); + ASSERT_TRUE(dbfull()->GetOptions(handles_[2]).disable_auto_compactions); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index 01ab37e21ebf..523abeb1cbd6 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -377,12 +377,14 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) { NewBloomFilterPolicy(kBloomBitsPerKey, false)); table_options.block_size = 1024; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + // The checks assume kTableCount number of files + options.disable_auto_compactions = true; DestroyAndReopen(options); // Hold open a snapshot to prevent range tombstones from being compacted // away. - ManagedSnapshot snapshot(db_); + ManagedSnapshot snapshot(db_.get()); Random rnd(5632); for (int table = 1; table <= kTableCount; ++table) { @@ -567,7 +569,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { options.target_file_size_base = 8192; options.max_bytes_for_level_base = 10000; options.max_bytes_for_level_multiplier = 2; - // This ensures there no compaction happening when we call GetProperty(). + // The checks assume kTableCount number of files options.disable_auto_compactions = true; options.merge_operator.reset(new TestPutOperator()); @@ -580,7 +582,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { DestroyAndReopen(options); // Hold open a snapshot to prevent range tombstones from being compacted away. - ManagedSnapshot snapshot(db_); + ManagedSnapshot snapshot(db_.get()); std::string level_tp_strings[kMaxLevel]; std::string tp_string; @@ -1517,16 +1519,14 @@ TEST_F(DBPropertiesTest, NeedCompactHintPersistentTest) { // Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage. TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) { - // Sampled compression requires at least one of the following four types. - if (!Snappy_Supported() && !Zlib_Supported() && !LZ4_Supported() && - !ZSTD_Supported()) { - return; - } - Options options = CurrentOptions(); options.disable_auto_compactions = true; options.table_properties_collector_factories.emplace_back( std::make_shared()); + options.compression = kNoCompression; + + bool fast_sampling_supported = Snappy_Supported() || LZ4_Supported(); + bool slow_sampling_supported = ZSTD_Supported() || Zlib_Supported(); for (bool sample_for_compression : {false, true}) { // For simplicity/determinism, sample 100% when enabled, or 0% when disabled @@ -1540,10 +1540,11 @@ TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) { // L1_0 ["a", "b"] // // L0_0 was created by flush. L1_0 was created by compaction. Each file - // contains one data block. + // contains one data block with enough data to be compressible. for (int i = 0; i < 3; ++i) { - ASSERT_OK(Put("a", "val")); - ASSERT_OK(Put("b", "val")); + for (int j = 0; j < 50; ++j) { + ASSERT_OK(Put(std::to_string(j), "thisismyvalue")); + } ASSERT_OK(Flush()); if (i == 1) { ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -1556,13 +1557,33 @@ TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) { ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props)); ASSERT_EQ(2, file_to_props.size()); for (const auto& file_and_props : file_to_props) { - auto& user_props = file_and_props.second->user_collected_properties; + auto& props = *file_and_props.second; + auto& user_props = props.user_collected_properties; ASSERT_TRUE(user_props.find(BlockCountingTablePropertiesCollector:: kNumSampledBlocksPropertyName) != user_props.end()); ASSERT_EQ(user_props.at(BlockCountingTablePropertiesCollector:: kNumSampledBlocksPropertyName), std::to_string(sample_for_compression ? 1 : 0)); + if (sample_for_compression) { + EXPECT_GT(props.fast_compression_estimated_data_size, 0); + EXPECT_GT(props.slow_compression_estimated_data_size, 0); + if (fast_sampling_supported) { + EXPECT_LT(props.fast_compression_estimated_data_size, + props.data_size); + if (slow_sampling_supported) { + EXPECT_LT(props.slow_compression_estimated_data_size, + props.fast_compression_estimated_data_size); + } + } + if (slow_sampling_supported) { + EXPECT_LT(props.slow_compression_estimated_data_size, + props.data_size); + } + } else { + EXPECT_EQ(props.fast_compression_estimated_data_size, 0); + EXPECT_EQ(props.slow_compression_estimated_data_size, 0); + } } } } @@ -1843,7 +1864,7 @@ TEST_F(DBPropertiesTest, MinObsoleteSstNumberToKeep) { options.listeners.push_back(listener); options.level0_file_num_compaction_trigger = kNumL0Files; DestroyAndReopen(options); - listener->SetDB(db_); + listener->SetDB(db_.get()); for (int i = 0; i < kNumL0Files; ++i) { // Make sure they overlap in keyspace to prevent trivial move diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index 5122aedc97a3..f0996ce34c94 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -2047,7 +2047,7 @@ TEST_F(DBRangeDelTest, IteratorReseek) { // Immutable memtable ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1), Key(2))); - ASSERT_OK(static_cast_with_check(db_)->TEST_SwitchMemtable()); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); std::string value; ASSERT_TRUE(dbfull()->GetProperty(db_->DefaultColumnFamily(), "rocksdb.num-immutable-mem-table", &value)); @@ -3825,6 +3825,89 @@ TEST_F(DBRangeDelTest, RowCache) { // and should not turn db into read-only mdoe. ASSERT_OK(Put(Key(5), "foo")); } + +TEST_F(DBRangeDelTest, SeekForPrevTest) { + // open db + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.compaction_style = kCompactionStyleUniversal; + + // add SST partitioner, split sst file with prefix length 2 + options.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(2); + Reopen(options); + + // File uses SST partitioner, so it will be split into 3 files + // SST file 1: ka1, ka2 + // SST file 2: kb1 + // SST file 3: kc1, kc2 + // Delete range covers from ka2 to kc2, which means record ka2 and kb1, kc1 + // are covered by the delete range + + std::vector> kv = {{"ka1", "value_1"}, + {"ka2", "value_2"}, + {"kb1", "value_3"}, + {"kc1", "value_4"}, + {"kc2", "value_5"}}; + for (auto& p : kv) { + ASSERT_OK(Put(p.first, p.second)); + } + + ASSERT_OK(Flush()); + // Compact to Lmax, it should have seq 0 now. + ASSERT_OK(CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Open an iterator and create a snapshot, so that keys are not deleted + // completely by delete range in SST + ReadOptions read_opts; + read_opts.snapshot = db_->GetSnapshot(); + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToFirst(); + // iterate all the keys and validate the value + for (int i = 0; iter->Valid(); iter->Next()) { + ASSERT_EQ(kv[i].first, iter->key().ToString()); + ASSERT_EQ(kv[i].second, iter->value().ToString()); + i++; + } + + // use delete range to delete the record + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "ka2", + "kc2")); + // Flush + ASSERT_OK(Flush()); + // Compact to Lmax + ASSERT_OK(CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Close the iterator and release the snapshot. + ASSERT_OK(iter->status()); + iter.reset(); + db_->ReleaseSnapshot(read_opts.snapshot); + + // create second iterator, seek each key and validate result + std::unique_ptr iter2(db_->NewIterator(ReadOptions())); + // Validate keys are deleted + iter2->SeekToFirst(); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("ka1", iter2->key().ToString()); + iter2->Next(); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("kc2", iter2->key().ToString()); + iter2->Next(); + ASSERT_FALSE(iter2->Valid()); + + // Validate seek for prev result + for (auto& p : kv) { + iter2->SeekForPrev(p.first); + ASSERT_TRUE(iter2->Valid()); + if (p.first == "kc2") { + ASSERT_EQ("kc2", iter2->key().ToString()); + } else { + ASSERT_EQ("ka1", iter2->key().ToString()); + } + } + ASSERT_OK(iter2->status()); + iter2.reset(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_rate_limiter_test.cc b/db/db_rate_limiter_test.cc index b28055225a0f..210e3c49ac32 100644 --- a/db/db_rate_limiter_test.cc +++ b/db/db_rate_limiter_test.cc @@ -442,6 +442,107 @@ TEST_P(DBRateLimiterOnWriteWALTest, AutoWalFlush) { EXPECT_EQ(actual_auto_wal_flush_request, options_.rate_limiter->GetTotalRequests(Env::IO_USER)); } + +class DBRateLimiterOnManualWALFlushTest + : public DBRateLimiterOnWriteTest, + public ::testing::WithParamInterface { + public: + static std::string GetTestNameSuffix( + ::testing::TestParamInfo info) { + std::ostringstream oss; + if (info.param == Env::IO_USER) { + oss << "RateLimitManualWALFlush"; + } else if (info.param == Env::IO_TOTAL) { + oss << "NoRateLimitManualWALFlush"; + } else if (info.param == Env::IO_HIGH) { + oss << "RateLimitManualWALFlushWithHighPriority"; + } else { + oss << "RateLimitManualWALFlushWithLowPriority"; + } + return oss.str(); + } + + explicit DBRateLimiterOnManualWALFlushTest() + : rate_limiter_priority_(GetParam()) {} + + void Init() { + options_ = GetOptions(); + // Enable manual WAL flush mode + options_.manual_wal_flush = true; + Reopen(options_); + } + + WriteOptions GetWriteOptions() { + WriteOptions write_options; + // WAL must be enabled for manual WAL flush to work + write_options.disableWAL = false; + // In manual WAL flush mode, WAL write rate limiting should be done through + // FlushWAL(), not WriteOptions::rate_limiter_priority + write_options.rate_limiter_priority = Env::IO_TOTAL; + return write_options; + } + + protected: + Env::IOPriority rate_limiter_priority_; +}; + +INSTANTIATE_TEST_CASE_P(DBRateLimiterOnManualWALFlushTest, + DBRateLimiterOnManualWALFlushTest, + ::testing::Values(Env::IO_TOTAL, Env::IO_USER, + Env::IO_HIGH, Env::IO_LOW), + DBRateLimiterOnManualWALFlushTest::GetTestNameSuffix); + +TEST_P(DBRateLimiterOnManualWALFlushTest, ManualWALFlush) { + Init(); + + const bool no_rate_limit = (rate_limiter_priority_ == Env::IO_TOTAL); + + ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL)); + + for (bool sync : {false, true}) { + std::int64_t prev_total_request = + options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL); + + Status put_status = Put("key_" + std::to_string(sync), + "value_" + std::to_string(sync), GetWriteOptions()); + + EXPECT_TRUE(put_status.ok()); + + // Since manual_wal_flush is enabled and write_options.rate_limiter_priority + // is IO_TOTAL, no rate limiting should have occurred for this user write + EXPECT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) - + prev_total_request); + + // Now explicitly flush the WAL with the test's rate_limiter_priority + prev_total_request = options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL); + std::int64_t prev_priority_request = + options_.rate_limiter->GetTotalRequests(rate_limiter_priority_); + + FlushWALOptions flush_options; + flush_options.sync = sync; + flush_options.rate_limiter_priority = rate_limiter_priority_; + Status flush_status = db_->FlushWAL(flush_options); + + EXPECT_TRUE(flush_status.ok()); + + std::int64_t manual_wal_flush_requests_total = + options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) - + prev_total_request; + std::int64_t manual_wal_flush_requests_for_priority = + options_.rate_limiter->GetTotalRequests(rate_limiter_priority_) - + prev_priority_request; + + if (no_rate_limit) { + EXPECT_EQ(0, manual_wal_flush_requests_total); + EXPECT_EQ(0, manual_wal_flush_requests_for_priority); + } else { + EXPECT_EQ(manual_wal_flush_requests_total, + manual_wal_flush_requests_for_priority); + EXPECT_GT(manual_wal_flush_requests_for_priority, 0); + } + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_readonly_with_timestamp_test.cc b/db/db_readonly_with_timestamp_test.cc index 7a37bfec81c5..6fbc43bb2664 100644 --- a/db/db_readonly_with_timestamp_test.cc +++ b/db/db_readonly_with_timestamp_test.cc @@ -237,7 +237,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) { it->Next(), ++count, ++key) { CheckIterUserEntry(it.get(), Key1(key), kTypeValue, "value" + std::to_string(i), write_timestamps[i]); - get_value_and_check(db_, read_opts, it->key(), it->value(), + get_value_and_check(db_.get(), read_opts, it->key(), it->value(), write_timestamps[i]); } ASSERT_OK(it->status()); @@ -250,7 +250,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) { it->Prev(), ++count, --key) { CheckIterUserEntry(it.get(), Key1(key), kTypeValue, "value" + std::to_string(i), write_timestamps[i]); - get_value_and_check(db_, read_opts, it->key(), it->value(), + get_value_and_check(db_.get(), read_opts, it->key(), it->value(), write_timestamps[i]); } ASSERT_OK(it->status()); @@ -272,7 +272,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) { it->Valid(); it->Next(), ++key, ++count) { CheckIterUserEntry(it.get(), Key1(key), kTypeValue, "value" + std::to_string(i), write_timestamps[i]); - get_value_and_check(db_, read_opts, it->key(), it->value(), + get_value_and_check(db_.get(), read_opts, it->key(), it->value(), write_timestamps[i]); } ASSERT_OK(it->status()); @@ -282,7 +282,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) { it->Valid(); it->Prev(), --key, ++count) { CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue, "value" + std::to_string(i), write_timestamps[i]); - get_value_and_check(db_, read_opts, it->key(), it->value(), + get_value_and_check(db_.get(), read_opts, it->key(), it->value(), write_timestamps[i]); } ASSERT_OK(it->status()); diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 060ce8644087..a5da2afacc44 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -56,12 +56,11 @@ class DBSecondaryTestBase : public DBBasicTestWithTimestampBase { ASSERT_OK(db_secondary_->DestroyColumnFamilyHandle(h)); } handles_secondary_.clear(); - delete db_secondary_; - db_secondary_ = nullptr; + db_secondary_.reset(); } DBImplSecondary* db_secondary_full() { - return static_cast(db_secondary_); + return static_cast(db_secondary_.get()); } void CheckFileTypeCounts(const std::string& dir, int expected_log, @@ -69,7 +68,7 @@ class DBSecondaryTestBase : public DBBasicTestWithTimestampBase { std::string secondary_path_; std::vector handles_secondary_; - DB* db_secondary_; + std::unique_ptr db_secondary_; }; void DBSecondaryTestBase::OpenSecondary(const Options& options) { @@ -152,14 +151,15 @@ TEST_F(DBSecondaryTest, NonExistingDb) { options.env = env_; options.max_open_files = -1; const std::string dbname = "/doesnt/exist"; - Status s = - DB::OpenAsSecondary(options, dbname, secondary_path_, &db_secondary_); + std::unique_ptr dbptr; + Status s = DB::OpenAsSecondary(options, dbname, secondary_path_, &dbptr); ASSERT_TRUE(s.IsIOError()); } TEST_F(DBSecondaryTest, ReopenAsSecondary) { Options options; options.env = env_; + options.preserve_internal_time_seconds = 300; Reopen(options); ASSERT_OK(Put("foo", "foo_value")); ASSERT_OK(Put("bar", "bar_value")); @@ -181,7 +181,7 @@ TEST_F(DBSecondaryTest, ReopenAsSecondary) { ReadOptions ropts; ropts.verify_checksums = true; - auto db1 = static_cast(db_); + auto db1 = static_cast(db_.get()); ASSERT_NE(nullptr, db1); Iterator* iter = db1->NewIterator(ropts); ASSERT_NE(nullptr, iter); @@ -507,6 +507,81 @@ TEST_F(DBSecondaryTest, OpenAsSecondary) { verify_db_func("new_foo_value", "new_bar_value"); } +TEST_F(DBSecondaryTest, OptionsOverrideTest) { + Options options; + options.env = env_; + options.preserve_internal_time_seconds = 300; + options.compaction_readahead_size = 200; + options.blob_compaction_readahead_size = 100; + Reopen(options); + + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(Flush()); + } + + CompactionServiceInput input; + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + for (auto& file : meta.levels[0].files) { + ASSERT_EQ(0, meta.levels[0].level); + input.input_files.push_back(file.name); + } + ASSERT_EQ(input.input_files.size(), 3); + + input.output_level = 1; + input.options_file_number = dbfull()->GetVersionSet()->options_file_number(); + input.cf_name = kDefaultColumnFamilyName; + ASSERT_OK(db_->GetDbIdentity(input.db_id)); + + ASSERT_EQ(db_->GetOptions().compaction_readahead_size, 200); + ASSERT_EQ(db_->GetOptions().blob_compaction_readahead_size, 100); + + Close(); + + std::string compaction_input_binary; + ASSERT_OK(input.Write(&compaction_input_binary)); + std::string compaction_result_binary; + + CompactionServiceOptionsOverride override_options; + override_options.env = env_; + override_options.table_factory.reset( + NewBlockBasedTableFactory(BlockBasedTableOptions())); + + ASSERT_OK( + StringToMap("compaction_readahead_size=8388608;" + "blob_compaction_readahead_size=4194304;" + "some_invalid_option=ignore_me;" + "env=this_should_not_fail;" + "max_open_files=100;", // this should be always overriden as + // -1 in remote compaction + &override_options.options_map)); + + bool verified = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::OpenAndCompact::AfterOpenAsSecondary:0", + [&](void* arg) { + auto secondary_db = static_cast(arg); + auto secondary_db_options = secondary_db->GetOptions(); + // DBOption + ASSERT_EQ(secondary_db_options.compaction_readahead_size, 8388608); + ASSERT_EQ(secondary_db_options.max_open_files, -1); + // CFOption + ASSERT_EQ(secondary_db_options.blob_compaction_readahead_size, 4194304); + verified = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(DB::OpenAndCompact(OpenAndCompactOptions(), dbname_, + secondary_path_, compaction_input_binary, + &compaction_result_binary, override_options)); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + ASSERT_TRUE(verified); +} + namespace { class TraceFileEnv : public EnvWrapper { public: @@ -529,6 +604,9 @@ class TraceFileEnv : public EnvWrapper { char* scratch) const override { return target_->Read(offset, n, result, scratch); } + Status GetFileSize(uint64_t* file_size) override { + return target_->GetFileSize(file_size); + } private: std::unique_ptr target_; @@ -755,7 +833,7 @@ TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) { options1.max_open_files = -1; OpenSecondary(options1); ASSERT_EQ(0, handles_secondary_.size()); - ASSERT_NE(nullptr, db_secondary_); + ASSERT_NE(nullptr, db_secondary_.get()); ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value")); ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value")); @@ -1073,7 +1151,7 @@ TEST_F(DBSecondaryTest, DISABLED_SwitchWAL) { for (int k = 0; k != 16; ++k) { ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k))); ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db(dbfull(), db_secondary_); + verify_db(dbfull(), db_secondary_.get()); } } @@ -1142,7 +1220,7 @@ TEST_F(DBSecondaryTest, DISABLED_SwitchWALMultiColumnFamilies) { TEST_SYNC_POINT( "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"); ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db(dbfull(), handles_, db_secondary_, handles_secondary_); + verify_db(dbfull(), handles_, db_secondary_.get(), handles_secondary_); SyncPoint::GetInstance()->ClearTrace(); } } @@ -1215,46 +1293,6 @@ TEST_F(DBSecondaryTest, CatchUpAfterFlush) { ASSERT_OK(iter3->status()); } -TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) { - bool called = false; - Options options; - options.env = env_; - options.disable_auto_compactions = true; - Reopen(options); - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->SetCallBack( - "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) { - ASSERT_NE(nullptr, arg); - called = true; - auto* s = static_cast(arg); - ASSERT_NOK(*s); - }); - SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData", - "BackgroundCallCompaction:0"}, - {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", - "DBImpl::CheckConsistency:BeforeGetFileSize"}}); - SyncPoint::GetInstance()->EnableProcessing(); - - ASSERT_OK(Put("a", "value0")); - ASSERT_OK(Put("c", "value0")); - ASSERT_OK(Flush()); - ASSERT_OK(Put("b", "value1")); - ASSERT_OK(Put("d", "value1")); - ASSERT_OK(Flush()); - port::Thread thread([this]() { - Options opts; - opts.env = env_; - opts.max_open_files = -1; - OpenSecondary(opts); - }); - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - thread.join(); - ASSERT_TRUE(called); -} - TEST_F(DBSecondaryTest, StartFromInconsistent) { Options options = CurrentOptions(); DestroyAndReopen(options); @@ -1318,7 +1356,7 @@ TEST_F(DBSecondaryTest, OpenWithTransactionDB) { TransactionDBOptions txn_db_opts; ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db)); ASSERT_NE(txn_db, nullptr); - db_ = txn_db; + db_.reset(txn_db); std::vector cfs = {"new_CF"}; CreateColumnFamilies(cfs, options); @@ -1522,7 +1560,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) { it->Next(), ++count, ++key) { CheckIterUserEntry(it.get(), Key1(key), kTypeValue, "value" + std::to_string(i), write_timestamps[i]); - get_value_and_check(db_, read_opts, it->key(), it->value(), + get_value_and_check(db_.get(), read_opts, it->key(), it->value(), write_timestamps[i]); } ASSERT_OK(it->status()); @@ -1535,7 +1573,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) { it->Prev(), ++count, --key) { CheckIterUserEntry(it.get(), Key1(key), kTypeValue, "value" + std::to_string(i), write_timestamps[i]); - get_value_and_check(db_, read_opts, it->key(), it->value(), + get_value_and_check(db_.get(), read_opts, it->key(), it->value(), write_timestamps[i]); } ASSERT_OK(it->status()); @@ -1557,7 +1595,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) { it->Valid(); it->Next(), ++key, ++count) { CheckIterUserEntry(it.get(), Key1(key), kTypeValue, "value" + std::to_string(i), write_timestamps[i]); - get_value_and_check(db_, read_opts, it->key(), it->value(), + get_value_and_check(db_.get(), read_opts, it->key(), it->value(), write_timestamps[i]); } ASSERT_OK(it->status()); @@ -1567,7 +1605,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) { it->Valid(); it->Prev(), --key, ++count) { CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue, "value" + std::to_string(i), write_timestamps[i]); - get_value_and_check(db_, read_opts, it->key(), it->value(), + get_value_and_check(db_.get(), read_opts, it->key(), it->value(), write_timestamps[i]); } ASSERT_OK(it->status()); diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 71511cee7420..d186efd8c600 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -135,21 +135,6 @@ TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) { Destroy(options); } -// Check that we don't crash when opening DB with -// DBOptions::skip_checking_sst_file_sizes_on_db_open = true. -TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) { - ASSERT_OK(Put("pika", "choo")); - ASSERT_OK(Flush()); - - // Just open the DB with the option set to true and check that we don't crash. - Options options; - options.env = env_; - options.skip_checking_sst_file_sizes_on_db_open = true; - Reopen(options); - - ASSERT_EQ("choo", Get("pika")); -} - TEST_F(DBSSTTest, DontDeleteMovedFile) { // This test triggers move compaction and verifies that the file is not // deleted when it's part of move compaction @@ -1748,45 +1733,6 @@ TEST_F(DBSSTTest, GetTotalSstFilesSize) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -TEST_F(DBSSTTest, OpenDBWithoutGetFileSizeInvocations) { - Options options = CurrentOptions(); - std::unique_ptr env{MockEnv::Create(Env::Default())}; - options.env = env.get(); - options.disable_auto_compactions = true; - options.compression = kNoCompression; - options.enable_blob_files = true; - options.blob_file_size = 32; // create one blob per file - options.skip_checking_sst_file_sizes_on_db_open = true; - - DestroyAndReopen(options); - // Generate 5 files in L0 - for (int i = 0; i < 5; i++) { - for (int j = 0; j < 10; j++) { - std::string val = "val_file_" + std::to_string(i); - ASSERT_OK(Put(Key(j), val)); - } - ASSERT_OK(Flush()); - } - Close(); - - bool is_get_file_size_called = false; - SyncPoint::GetInstance()->SetCallBack( - "MockFileSystem::GetFileSize:CheckFileType", [&](void* arg) { - std::string* filename = static_cast(arg); - if (filename->find(".blob") != std::string::npos) { - is_get_file_size_called = true; - } - }); - - SyncPoint::GetInstance()->EnableProcessing(); - Reopen(options); - ASSERT_FALSE(is_get_file_size_called); - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); - - Destroy(options); -} - TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) { Options options = CurrentOptions(); options.disable_auto_compactions = true; @@ -1991,6 +1937,70 @@ TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } +TEST_F(DBSSTTest, SstGetFileSizeFails) { + // Build an SST file + ASSERT_OK(Put("x", "zaphod")); + ASSERT_OK(Flush()); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(1U, metadata.size()); + std::string filename = dbname_ + metadata[0].name; + + // Prepare for fault injection + std::shared_ptr fault_fs = + std::make_shared( + CurrentOptions().env->GetFileSystem()); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.env = fault_fs_env.get(); + options.paranoid_checks = false; // don't check file sizes on open + + for (int i = 0; i < 4; i++) { + SCOPED_TRACE("Iteration = " + std::to_string(i)); + fault_fs->SetFailRandomAccessGetFileSizeSst(false); + fault_fs->SetFailFilesystemGetFileSizeSst(false); + Close(); + + if (i == 1) { + // Just FSRandomAccessFile::GetFileSize fails, which should be worked + // around + fault_fs->SetFailRandomAccessGetFileSizeSst(true); + } else if (i == 2) { + // FileSystem::GetFileSize fails, which should be worked around if + // FSRandomAccessFile::GetFileSize is supported + fault_fs->SetFailFilesystemGetFileSizeSst(true); + } else if (i == 3) { + // Both GetFileSize APIs fail with an IOError + fault_fs->SetFailRandomAccessGetFileSizeSst(true); + fault_fs->SetFailFilesystemGetFileSizeSst(true); + } + + ASSERT_OK(TryReopen(options)); + std::string value; + Status get_status = db_->Get({}, "x", &value); + if (i < 2) { + ASSERT_OK(get_status); + } else if (i == 2) { + if (encrypted_env_) { + // Can't recover because RandomAccessFile::GetFileSize is not supported + // on EncryptedEnv + // Fail with propagated IOError. (Not Corruption nor NotSupported!) + ASSERT_EQ(get_status.code(), Status::Code::kIOError); + ASSERT_STREQ(get_status.getState(), "FileSystem::GetFileSize failed"); + } else { + // Never sees the FileSystem::GetFileSize failure + ASSERT_OK(get_status); + } + } else { + ASSERT_EQ(i, 3); + // Fail with propagated IOError. (Not Corruption nor NotSupported!) + ASSERT_EQ(get_status.code(), Status::Code::kIOError); + ASSERT_STREQ(get_status.getState(), "FileSystem::GetFileSize failed"); + } + } + Close(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_statistics_test.cc b/db/db_statistics_test.cc index 4fe3032e901c..91f9df57e92b 100644 --- a/db/db_statistics_test.cc +++ b/db/db_statistics_test.cc @@ -321,7 +321,7 @@ TEST_F(DBStatisticsTest, BytesWrittenStats) { options.enable_pipelined_write = enable_pipelined_write; ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db)); ASSERT_NE(txn_db, nullptr); - db_ = txn_db->GetBaseDB(); + db_.reset(txn_db); WriteOptions wopts; TransactionOptions txn_opts; @@ -351,8 +351,7 @@ TEST_F(DBStatisticsTest, BytesWrittenStats) { WriteBatchInternal::kHeader); // Cleanup - db_ = nullptr; - delete txn_db; + db_.reset(); } } diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc index a899c03e2935..0f9e1327825c 100644 --- a/db/db_table_properties_test.cc +++ b/db/db_table_properties_test.cc @@ -69,14 +69,6 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) { // Create 4 tables for (int table = 0; table < 4; ++table) { - // Use old meta name for table properties for one file - if (table == 3) { - SyncPoint::GetInstance()->SetCallBack( - "BlockBasedTableBuilder::WritePropertiesBlock:Meta", [&](void* meta) { - *static_cast(meta) = &kPropertiesBlockOldName; - }); - SyncPoint::GetInstance()->EnableProcessing(); - } // Build file for (int i = 0; i < 10 + table; ++i) { ASSERT_OK( @@ -84,7 +76,6 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) { } ASSERT_OK(db_->Flush(FlushOptions())); } - SyncPoint::GetInstance()->DisableProcessing(); std::string original_session_id; ASSERT_OK(db_->GetDbSessionId(original_session_id)); @@ -99,7 +90,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) { // Clear out auto-opened files dbfull()->TEST_table_cache()->EraseUnRefEntries(); ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U); - VerifyTableProperties(db_, 10 + 11 + 12 + 13); + VerifyTableProperties(db_.get(), 10 + 11 + 12 + 13); // 2. Put two tables to table cache and Reopen(options); @@ -112,7 +103,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) { Get(std::to_string(i * 100 + 0)); } - VerifyTableProperties(db_, 10 + 11 + 12 + 13); + VerifyTableProperties(db_.get(), 10 + 11 + 12 + 13); // 3. Put all tables to table cache Reopen(options); @@ -120,7 +111,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) { for (int i = 0; i < 4; ++i) { Get(std::to_string(i * 100 + 0)); } - VerifyTableProperties(db_, 10 + 11 + 12 + 13); + VerifyTableProperties(db_.get(), 10 + 11 + 12 + 13); // 4. Try to read CORRUPT properties (a) directly from file, and (b) // through reader on Get @@ -169,10 +160,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) { SyncPoint::GetInstance()->DisableProcessing(); } -TEST_F(DBTablePropertiesTest, InvalidIgnored) { - // RocksDB versions 2.5 - 2.7 generate some properties that Block considers - // invalid in some way. This approximates that. - +TEST_F(DBTablePropertiesTest, InvalidReportedAsCorruption) { // Inject properties block data that Block considers invalid SyncPoint::GetInstance()->SetCallBack( "BlockBasedTableBuilder::WritePropertiesBlock:BlockData", @@ -189,13 +177,10 @@ TEST_F(DBTablePropertiesTest, InvalidIgnored) { for (int i = 0; i < 10; ++i) { ASSERT_OK(db_->Put(WriteOptions(), std::to_string(i), "val")); } - ASSERT_OK(db_->Flush(FlushOptions())); + // Corrupted properties block should be detected and reported as corruption + ASSERT_TRUE(db_->Flush(FlushOptions()).IsCorruption()); SyncPoint::GetInstance()->DisableProcessing(); - - // Not crashing is good enough - TablePropertiesCollection props; - ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); } TEST_F(DBTablePropertiesTest, CreateOnDeletionCollectorFactory) { @@ -229,6 +214,56 @@ TEST_F(DBTablePropertiesTest, CreateOnDeletionCollectorFactory) { ASSERT_EQ(0.5, del_factory->GetDeletionRatio()); } +TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesByLevelTest) { + Random rnd(202); + Options options; + options.level_compaction_dynamic_level_bytes = false; + options.create_if_missing = true; + options.write_buffer_size = 4096; + options.max_write_buffer_number = 2; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 2; + options.target_file_size_base = 2048; + options.max_bytes_for_level_base = 40960; + options.max_bytes_for_level_multiplier = 4; + options.hard_pending_compaction_bytes_limit = 16 * 1024; + options.num_levels = 8; + options.env = env_; + + DestroyAndReopen(options); + + // build a decent LSM + for (int i = 0; i < 10000; i++) { + EXPECT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + if (NumTableFilesAtLevel(0) == 0) { + EXPECT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102))); + ASSERT_OK(Flush()); + } + + ASSERT_OK(db_->PauseBackgroundWork()); + + // Ensure that we have at least L0, L1 and L2 + ASSERT_GT(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + ASSERT_GT(NumTableFilesAtLevel(2), 0); + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(&cf_meta); + std::vector> levels_props; + ASSERT_OK(db_->GetPropertiesOfTablesByLevel(db_->DefaultColumnFamily(), + &levels_props)); + for (int i = 0; i < 8; i++) { + const std::unique_ptr& level_props = + levels_props[i]; + ASSERT_EQ(level_props->size(), cf_meta.levels[i].files.size()); + } + + Close(); +} + // Test params: // 1) whether to enable user-defined timestamps class DBTablePropertiesInRangeTest : public DBTestBase, @@ -292,7 +327,7 @@ class DBTablePropertiesInRangeTest : public DBTestBase, keys.reserve(range_size * 2); for (auto& r : ranges) { auto [start, limit] = MaybeAddTimestampsToRange( - &r.start, &r.limit, ts_sz, &keys.emplace_back(), &keys.emplace_back(), + r.start, r.limit, ts_sz, &keys.emplace_back(), &keys.emplace_back(), /*exclusive_end=*/false); EXPECT_TRUE(start.has_value()); EXPECT_TRUE(limit.has_value()); @@ -737,6 +772,46 @@ TEST_P(DBTablePropertiesTest, RatioBasedDeletionTriggeredCompactionMarking) { } } +TEST_F(DBTablePropertiesTest, KeyLargestSmallestSeqno) { + ASSERT_OK(db_->Put(WriteOptions(), "key1", "value1")); + ASSERT_OK(db_->Put(WriteOptions(), "key2", "value2")); + ASSERT_OK(db_->Put(WriteOptions(), "key3", "value3")); + ASSERT_OK(db_->Flush(FlushOptions())); + + { + TablePropertiesCollection props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); + ASSERT_EQ(1U, props.size()); + + auto table_props = props.begin()->second; + + ASSERT_TRUE(table_props->HasKeyLargestSeqno()); + ASSERT_TRUE(table_props->HasKeySmallestSeqno()); + + ASSERT_EQ(table_props->key_largest_seqno, + table_props->key_smallest_seqno + 2); + ASSERT_GT(table_props->key_largest_seqno, 0U); + ASSERT_GT(table_props->key_smallest_seqno, 0U); + } + + // Becomes zero after compaction + { + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + TablePropertiesCollection props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); + ASSERT_EQ(1U, props.size()); + + auto table_props = props.begin()->second; + ASSERT_TRUE(table_props->HasKeyLargestSeqno()); + ASSERT_TRUE(table_props->HasKeySmallestSeqno()); + + ASSERT_EQ(table_props->key_largest_seqno, table_props->key_smallest_seqno); + ASSERT_EQ(table_props->key_largest_seqno, 0U); + } +} + INSTANTIATE_TEST_CASE_P(DBTablePropertiesTest, DBTablePropertiesTest, ::testing::Values("kCompactionStyleLevel", "kCompactionStyleUniversal")); diff --git a/db/db_test.cc b/db/db_test.cc index e141e562afbd..9c0dc9fe326b 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -59,11 +59,13 @@ #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/compression.h" +#include "util/defer.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/rate_limiter_impl.h" @@ -102,7 +104,7 @@ TEST_F(DBTest, MockEnvTest) { Options options; options.create_if_missing = true; options.env = env.get(); - DB* db; + std::unique_ptr db; const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; @@ -130,7 +132,7 @@ TEST_F(DBTest, MockEnvTest) { ASSERT_OK(iterator->status()); delete iterator; - DBImpl* dbi = static_cast_with_check(db); + DBImpl* dbi = static_cast_with_check(db.get()); ASSERT_OK(dbi->TEST_FlushMemTable()); for (size_t i = 0; i < 3; ++i) { @@ -139,7 +141,122 @@ TEST_F(DBTest, MockEnvTest) { ASSERT_TRUE(res == vals[i]); } - delete db; + db.reset(); +} + +TEST_F(DBTest, RequestIdPlumbingTest) { + // test that request_id is passed to the filesystem, from + // ReadOptions to IODebugContext + Options options = CurrentOptions(); + options.env = env_; + + // Create a mock environment to capture IODebugContext during reads + IODebugContext dbgCopy; + const std::string* captured_request_id_dbg; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "RandomAccessFileReader::Read:IODebugContext", [&](void* arg) { + IODebugContext* dbg = static_cast(arg); + if (dbg == nullptr) { + captured_request_id_dbg = nullptr; + } else { + captured_request_id_dbg = dbg->request_id; + // Test IODebugContext assignment operator + dbgCopy = *dbg; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("k1", "v1")); + ASSERT_OK(Flush()); + + // test request_id plumbing during a get + { + const std::string test_request_id = "test_request_id_123"; + ReadOptions read_opts; + read_opts.request_id = &test_request_id; + std::string value; + ASSERT_OK(db_->Get(read_opts, "k1", &value)); + + // Verify the request_id was propagated to the file system + ASSERT_NE(captured_request_id_dbg, nullptr); + ASSERT_EQ(*captured_request_id_dbg, test_request_id); + + ASSERT_NE(dbgCopy.request_id, nullptr); + ASSERT_NE(dbgCopy.request_id, captured_request_id_dbg); + ASSERT_EQ(*dbgCopy.request_id, test_request_id); + } + + captured_request_id_dbg = nullptr; + + // test request_id plumbing during iterator seek + ASSERT_OK(Put("k2", "v2")); + ASSERT_OK(Flush()); + { + ReadOptions read_opts; + const std::string request_id = "test_request_id_456"; + read_opts.request_id = &request_id; + + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->Seek("k2"); + ASSERT_TRUE(iter->Valid()); + + // Verify the request_id was propagated to the file system + ASSERT_NE(captured_request_id_dbg, nullptr); + ASSERT_EQ(*captured_request_id_dbg, request_id); + + ASSERT_NE(dbgCopy.request_id, nullptr); + ASSERT_NE(dbgCopy.request_id, captured_request_id_dbg); + ASSERT_EQ(*dbgCopy.request_id, request_id); + + // Test IODebugContext copy constructor + IODebugContext dbgCopy2(dbgCopy); + ASSERT_NE(dbgCopy2.request_id, nullptr); + ASSERT_NE(dbgCopy2.request_id, captured_request_id_dbg); + ASSERT_NE(dbgCopy2.request_id, dbgCopy.request_id); + ASSERT_EQ(*dbgCopy2.request_id, request_id); + } + + // test request_id plumbing during multiget + captured_request_id_dbg = nullptr; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "RandomAccessFileReader::MultiRead:IODebugContext", [&](void* arg) { + IODebugContext* dbg = static_cast(arg); + if (dbg == nullptr) { + captured_request_id_dbg = nullptr; + } else { + captured_request_id_dbg = dbg->request_id; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("k3", "v3")); + ASSERT_OK(Put("k4", "v4")); + ASSERT_OK(Flush()); + + { + ReadOptions read_opts; + const std::string multiget_request_id = "test_request_id_789"; + read_opts.request_id = &multiget_request_id; + + std::vector values; + std::vector keys = {Slice("k3"), Slice("k4")}; + + values.resize(keys.size()); + + std::vector cfhs(keys.size(), + db_->DefaultColumnFamily()); + db_->MultiGet(read_opts, cfhs, keys, &values); + + ASSERT_NE(captured_request_id_dbg, nullptr); + ASSERT_EQ(*captured_request_id_dbg, multiget_request_id); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } TEST_F(DBTest, MemEnvTest) { @@ -147,7 +264,7 @@ TEST_F(DBTest, MemEnvTest) { Options options; options.create_if_missing = true; options.env = env.get(); - DB* db; + std::unique_ptr db; const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; @@ -175,7 +292,7 @@ TEST_F(DBTest, MemEnvTest) { ASSERT_OK(iterator->status()); delete iterator; - DBImpl* dbi = static_cast_with_check(db); + DBImpl* dbi = static_cast_with_check(db.get()); ASSERT_OK(dbi->TEST_FlushMemTable()); for (size_t i = 0; i < 3; ++i) { @@ -184,7 +301,7 @@ TEST_F(DBTest, MemEnvTest) { ASSERT_TRUE(res == vals[i]); } - delete db; + db.reset(); options.create_if_missing = false; ASSERT_OK(DB::Open(options, "/dir/db", &db)); @@ -193,7 +310,7 @@ TEST_F(DBTest, MemEnvTest) { ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); ASSERT_TRUE(res == vals[i]); } - delete db; + db.reset(); } TEST_F(DBTest, WriteEmptyBatch) { @@ -961,7 +1078,9 @@ TEST_F(DBTest, WrongLevel0Config) { options.level0_stop_writes_trigger = 1; options.level0_slowdown_writes_trigger = 2; options.level0_file_num_compaction_trigger = 3; - ASSERT_OK(DB::Open(options, dbname_, &db_)); + { + ASSERT_OK(DB::Open(options, dbname_, &db_)); + } } TEST_F(DBTest, GetOrderedByLevels) { @@ -1090,8 +1209,10 @@ TEST_F(DBTest, FlushSchedule) { t.join(); } - auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default"); - auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu"); + auto default_tables = + GetNumberOfSstFilesForColumnFamily(db_.get(), "default"); + auto pikachu_tables = + GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"); ASSERT_LE(default_tables, static_cast(10)); ASSERT_GT(default_tables, static_cast(0)); ASSERT_LE(pikachu_tables, static_cast(10)); @@ -1161,12 +1282,6 @@ class DelayFilterFactory : public CompactionFilterFactory { }; } // anonymous namespace -static std::string CompressibleString(Random* rnd, int len) { - std::string r; - test::CompressibleString(rnd, 0.8, len, &r); - return r; -} - TEST_F(DBTest, FailMoreDbPaths) { Options options = CurrentOptions(); options.db_paths.emplace_back(dbname_, 10000000); @@ -1381,6 +1496,246 @@ TEST_F(DBTest, MetaDataTest) { CheckLiveFilesMeta(live_file_meta, files_by_level); } +TEST_F(DBTest, GetColumnFamilyMetaDataWithKeyRangeAndLevel) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + + int64_t temp_time = 0; + ASSERT_OK(options.env->GetCurrentTime(&temp_time)); + + DestroyAndReopen(options); + + Random rnd(301); + int key_index = 0; + for (int i = 0; i < 100; ++i) { + // Add a single blob reference to each file + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, /* blob_file_number */ i + 1000, + /* offset */ 1234, /* size */ 5678, kNoCompression); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, Key(key_index), + blob_index)); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + + ++key_index; + + // Fill up the rest of the file with random values. + GenerateNewFile(&rnd, &key_index, /* nowait */ true); + + ASSERT_OK(Flush()); + } + + std::vector> files_by_level; + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level); + + ASSERT_OK(options.env->GetCurrentTime(&temp_time)); + + ColumnFamilyMetaData cf_meta; + // Keys in the SST files are distributed + // (key000000, key000100) ->File 1 + // (key000101, key000201) -> File 2 + // (key000202, key000302) -> File 3 + // (key009999, key010099) -> File 100 + + // With keySlice (key000050, key000150) => should only pick 2 files(instead of + // default 100 that is in the level) + auto startKey = Slice("key000050"); + auto endKey = Slice("key000150"); + GetColumnFamilyMetaDataOptions cf_options(startKey, endKey, 0); + db_->GetColumnFamilyMetaData(cf_options, &cf_meta); + ASSERT_EQ(cf_meta.levels.size(), 1); + const auto& level_meta_from_cf = cf_meta.levels[0]; + ASSERT_EQ(level_meta_from_cf.files.size(), 2); + ASSERT_LT(level_meta_from_cf.files[1].smallestkey, + std::string(startKey.data())); + ASSERT_GT(level_meta_from_cf.files[0].largestkey, std::string(endKey.data())); + + GetColumnFamilyMetaDataOptions cf_option_default; + db_->GetColumnFamilyMetaData(cf_option_default, &cf_meta); + ASSERT_EQ(cf_meta.levels.size(), 1); + ASSERT_EQ(cf_meta.levels[0].files.size(), 100); + + // Test with start key valid and end key unbounded + // This should get all files from key000150 onwards (99 files) + auto startKeyUnbounded = Slice("key000150"); + GetColumnFamilyMetaDataOptions cf_options_unbounded_end(startKeyUnbounded, + OptSlice(), 0); + db_->GetColumnFamilyMetaData(cf_options_unbounded_end, &cf_meta); + ASSERT_EQ(cf_meta.levels.size(), 1); + ASSERT_EQ(cf_meta.levels[0].files.size(), 99); + + // Test with end key valid and start key unbounded + // This should get all files from beginning to key000250 ( 3 files) + auto endKeyUnbounded = Slice("key000250"); + GetColumnFamilyMetaDataOptions cf_options_unbounded_start(OptSlice(), + endKeyUnbounded, 0); + db_->GetColumnFamilyMetaData(cf_options_unbounded_start, &cf_meta); + ASSERT_EQ(cf_meta.levels.size(), 1); + ASSERT_EQ(cf_meta.levels[0].files.size(), 3); +} + +TEST_F(DBTest, GetColumnFamilyMetaDataBottommostLevel) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.num_levels = 7; + + DestroyAndReopen(options); + + Random rnd(301); + int key_index = 0; + + for (int i = 0; i < 100; ++i) { + GenerateNewFile(&rnd, &key_index, /* nowait */ true); + ASSERT_OK(Flush()); + } + + CompactRangeOptions compact_options; + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForce; + compact_options.change_level = true; + compact_options.target_level = 6; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + + // Nothing on Level 0 after compaction + ColumnFamilyMetaData cf_meta; + GetColumnFamilyMetaDataOptions cf_options_0(OptSlice(), OptSlice(), 0); + db_->GetColumnFamilyMetaData(cf_options_0, &cf_meta); + + ASSERT_EQ(cf_meta.levels.size(), 0); + ASSERT_EQ(cf_meta.file_count, 0); + + // Data should be in Level 6 + GetColumnFamilyMetaDataOptions cf_options(OptSlice(), OptSlice(), 6); + db_->GetColumnFamilyMetaData(cf_options, &cf_meta); + + ASSERT_EQ(cf_meta.levels.size(), 1); + ASSERT_EQ(cf_meta.levels[0].level, 6); + ASSERT_GT(cf_meta.levels[0].files.size(), 0); + size_t all_files = cf_meta.levels[0].files.size(); + + // Keys in the SST files are distributed across level 6 + // Test with key range - should only return files within the range + auto startKey = Slice("key000050"); + auto endKey = Slice("key000150"); + GetColumnFamilyMetaDataOptions cf_options_range(startKey, endKey, 6); + db_->GetColumnFamilyMetaData(cf_options_range, &cf_meta); + + ASSERT_EQ(cf_meta.levels.size(), 1); + ASSERT_EQ(cf_meta.levels[0].level, 6); + ASSERT_GT(cf_meta.levels[0].files.size(), 0); + size_t files_in_range = cf_meta.levels[0].files.size(); + + // Files in range should be less than or equal to all files + ASSERT_LE(files_in_range, all_files); +} + +TEST_F(DBTest, GetColumnFamilyMetaDataMultipleLevels) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.num_levels = 7; + + DestroyAndReopen(options); + + Random rnd(301); + int key_index = 0; + + for (int i = 0; i < 50; ++i) { + GenerateNewFile(&rnd, &key_index, /* nowait */ true); + ASSERT_OK(Flush()); + } + + CompactRangeOptions compact_options; + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForce; + compact_options.change_level = true; + compact_options.target_level = 6; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + + for (int i = 0; i < 30; ++i) { + GenerateNewFile(&rnd, &key_index, /* nowait */ true); + ASSERT_OK(Flush()); + } + + // First verify both levels have files without key range filter + ColumnFamilyMetaData cf_meta_all_no_range; + GetColumnFamilyMetaDataOptions cf_options_all_no_range; + db_->GetColumnFamilyMetaData(cf_options_all_no_range, &cf_meta_all_no_range); + + bool has_level_0 = false; + bool has_level_6 = false; + for (const auto& level : cf_meta_all_no_range.levels) { + if (level.level == 0 && level.files.size() > 0) { + has_level_0 = true; + } + if (level.level == 6 && level.files.size() > 0) { + has_level_6 = true; + } + } + + ASSERT_TRUE(has_level_0); + ASSERT_TRUE(has_level_6); + + // Test querying bottommost level only with key range + // Use a range that should be in the first set of files (now in level 6) + auto startKey = Slice("key000050"); + auto endKey = Slice("key000150"); + ColumnFamilyMetaData cf_meta_bottommost; + GetColumnFamilyMetaDataOptions cf_options_bottommost(startKey, endKey, 6); + db_->GetColumnFamilyMetaData(cf_options_bottommost, &cf_meta_bottommost); + + ASSERT_EQ(cf_meta_bottommost.levels.size(), 1); + ASSERT_EQ(cf_meta_bottommost.levels[0].level, 6); + ASSERT_GT(cf_meta_bottommost.levels[0].files.size(), 0); + size_t level_6_files_in_range = cf_meta_bottommost.levels[0].files.size(); + + // Test querying all levels with same key range + ColumnFamilyMetaData cf_meta_all; + GetColumnFamilyMetaDataOptions cf_options_all(startKey, endKey); + db_->GetColumnFamilyMetaData(cf_options_all, &cf_meta_all); + + size_t level_6_files_in_range_from_all = 0; + for (const auto& level : cf_meta_all.levels) { + if (level.level == 6) { + level_6_files_in_range_from_all = level.files.size(); + } + } + + ASSERT_GT(level_6_files_in_range_from_all, 0); + ASSERT_EQ(level_6_files_in_range, level_6_files_in_range_from_all); +} + +TEST_F(DBTest, GetColumnFamilyMetaDataEmptyDB) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.num_levels = 7; + + DestroyAndReopen(options); + + // Test on empty database + ColumnFamilyMetaData cf_meta_empty_db; + GetColumnFamilyMetaDataOptions cf_options_empty_db; + db_->GetColumnFamilyMetaData(cf_options_empty_db, &cf_meta_empty_db); + + ASSERT_EQ(cf_meta_empty_db.levels.size(), 0); + ASSERT_EQ(cf_meta_empty_db.file_count, 0); + ASSERT_EQ(cf_meta_empty_db.size, 0); + + // Test on empty database with key range + auto startKey = Slice("key000050"); + auto endKey = Slice("key000150"); + ColumnFamilyMetaData cf_meta_empty_range; + GetColumnFamilyMetaDataOptions cf_options_empty_range(startKey, endKey); + db_->GetColumnFamilyMetaData(cf_options_empty_range, &cf_meta_empty_range); + + ASSERT_EQ(cf_meta_empty_range.levels.size(), 0); + ASSERT_EQ(cf_meta_empty_range.file_count, 0); + ASSERT_EQ(cf_meta_empty_range.size, 0); +} + TEST_F(DBTest, AllMetaDataTest) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -2017,7 +2372,7 @@ TEST_F(DBTest, Snapshot) { ASSERT_OK(Put(1, "foo", "1v3")); { - ManagedSnapshot s3(db_); + ManagedSnapshot s3(db_.get()); ASSERT_EQ(3U, GetNumSnapshots()); ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); @@ -2374,37 +2729,43 @@ TEST_F(DBTest, DBOpen_Options) { ASSERT_OK(DestroyDB(dbname, options)); // Does not exist, and create_if_missing == false: error - DB* db = nullptr; + std::unique_ptr db; options.create_if_missing = false; - Status s = DB::Open(options, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); + { + Status s = DB::Open(options, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); + } ASSERT_TRUE(db == nullptr); // Does not exist, and create_if_missing == true: OK options.create_if_missing = true; - s = DB::Open(options, dbname, &db); - ASSERT_OK(s); + { + Status s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + } ASSERT_TRUE(db != nullptr); - delete db; - db = nullptr; + db.reset(); // Does exist, and error_if_exists == true: error options.create_if_missing = false; options.error_if_exists = true; - s = DB::Open(options, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr); + { + Status s = DB::Open(options, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr); + } ASSERT_TRUE(db == nullptr); // Does exist, and error_if_exists == false: OK options.create_if_missing = true; options.error_if_exists = false; - s = DB::Open(options, dbname, &db); - ASSERT_OK(s); + { + Status s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + } ASSERT_TRUE(db != nullptr); - delete db; - db = nullptr; + db.reset(); } TEST_F(DBTest, DBOpen_Change_NumLevels) { @@ -2442,25 +2803,36 @@ TEST_F(DBTest, DestroyDBMetaDatabase) { ASSERT_OK(DestroyDB(dbname, options)); // Setup databases - DB* db = nullptr; - ASSERT_OK(DB::Open(options, dbname, &db)); - delete db; - db = nullptr; - ASSERT_OK(DB::Open(options, metadbname, &db)); - delete db; - db = nullptr; - ASSERT_OK(DB::Open(options, metametadbname, &db)); - delete db; - db = nullptr; + { + std::unique_ptr db; + ASSERT_OK(DB::Open(options, dbname, &db)); + } + { + std::unique_ptr db; + ASSERT_OK(DB::Open(options, metadbname, &db)); + } + { + std::unique_ptr db; + ASSERT_OK(DB::Open(options, metametadbname, &db)); + } // Delete databases ASSERT_OK(DestroyDB(dbname, options)); // Check if deletion worked. options.create_if_missing = false; - ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok()); - ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok()); - ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok()); + { + std::unique_ptr dbptr; + ASSERT_TRUE(!(DB::Open(options, dbname, &dbptr)).ok()); + } + { + std::unique_ptr dbptr; + ASSERT_TRUE(!(DB::Open(options, metadbname, &dbptr)).ok()); + } + { + std::unique_ptr dbptr; + ASSERT_TRUE(!(DB::Open(options, metametadbname, &dbptr)).ok()); + } } TEST_F(DBTest, SnapshotFiles) { @@ -2539,13 +2911,11 @@ TEST_F(DBTest, SnapshotFiles) { column_families.emplace_back("default", ColumnFamilyOptions()); column_families.emplace_back("pikachu", ColumnFamilyOptions()); std::vector cf_handles; - DB* snapdb; + std::unique_ptr snapdb; DBOptions opts; opts.env = env_; opts.create_if_missing = false; - Status stat = - DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb); - ASSERT_OK(stat); + ASSERT_OK(DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb)); ReadOptions roptions; std::string val; @@ -2556,7 +2926,7 @@ TEST_F(DBTest, SnapshotFiles) { for (auto cfh : cf_handles) { delete cfh; } - delete snapdb; + snapdb.reset(); // look at the new live files after we added an 'extra' key // and after we took the first snapshot. @@ -2758,7 +3128,7 @@ struct MTThread { static void MTThreadBody(void* arg) { MTThread* t = static_cast(arg); int id = t->id; - DB* db = t->state->test->db_; + DB* db = t->state->test->db_.get(); int counter = 0; std::shared_ptr clock = SystemClock::Default(); auto end_micros = clock->NowMicros() + kTestSeconds * 1000000U; @@ -2973,7 +3343,7 @@ TEST_F(DBTest, GroupCommitTest) { GCThread thread[kGCNumThreads]; for (int id = 0; id < kGCNumThreads; id++) { thread[id].id = id; - thread[id].db = db_; + thread[id].db = db_.get(); thread[id].done = false; env_->StartThread(GCThreadBody, &thread[id]); } @@ -3180,6 +3550,15 @@ class ModelDB : public DB { return Status(); } + using DB::GetPropertiesOfTablesByLevel; + Status GetPropertiesOfTablesByLevel( + ColumnFamilyHandle* /* column_family */, + std::vector< + std::unique_ptr>* /* props_by_level */) + override { + return Status(); + } + using DB::KeyMayExist; bool KeyMayExist(const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, @@ -3331,6 +3710,8 @@ class ModelDB : public DB { void EnableManualCompaction() override {} void DisableManualCompaction() override {} + void AbortAllCompactions() override {} + void ResumeAllCompactions() override {} Status WaitForCompact( const WaitForCompactOptions& /* wait_for_compact_options */) override { @@ -3340,11 +3721,6 @@ class ModelDB : public DB { using DB::NumberLevels; int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; } - using DB::MaxMemCompactionLevel; - int MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) override { - return 1; - } - using DB::Level0StopWriteTrigger; int Level0StopWriteTrigger(ColumnFamilyHandle* /*column_family*/) override { return -1; @@ -3401,7 +3777,7 @@ class ModelDB : public DB { } Status GetCurrentWalFile( - std::unique_ptr* /*current_log_file*/) override { + std::unique_ptr* /*current_wal_file*/) override { return Status::OK(); } @@ -3420,6 +3796,11 @@ class ModelDB : public DB { void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/, ColumnFamilyMetaData* /*metadata*/) override {} + void GetColumnFamilyMetaData( + ColumnFamilyHandle* /*column_family*/, + const GetColumnFamilyMetaDataOptions& /*options*/, + ColumnFamilyMetaData* /*metadata*/) override {} + Status GetDbIdentity(std::string& /*identity*/) const override { return Status::OK(); } @@ -3440,6 +3821,11 @@ class ModelDB : public DB { return Status::OK(); } + Status GetNewestUserDefinedTimestamp( + ColumnFamilyHandle* /*cf*/, std::string* /*newest_timestamp*/) override { + return Status::OK(); + } + ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; } private: @@ -3629,8 +4015,10 @@ TEST_P(DBTestRandomized, Randomized) { // than return a key that is close to it. if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex && option_config_ != kBlockBasedTableWithPrefixHashIndex) { - ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); - ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); + ASSERT_TRUE( + CompareIterators(step, &model, db_.get(), nullptr, nullptr)); + ASSERT_TRUE( + CompareIterators(step, &model, db_.get(), model_snap, db_snap)); } // Save a snapshot from each DB this time that we'll use next @@ -3644,7 +4032,7 @@ TEST_P(DBTestRandomized, Randomized) { } Reopen(options); - ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); + ASSERT_TRUE(CompareIterators(step, &model, db_.get(), nullptr, nullptr)); model_snap = model.GetSnapshot(); db_snap = db_->GetSnapshot(); @@ -4814,7 +5202,7 @@ TEST_F(DBTest, DynamicMemtableOptions) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -#ifdef ROCKSDB_USING_THREAD_STATUS +#ifndef NROCKSDB_THREAD_STATUS namespace { bool VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type, int expected_count) { @@ -5070,7 +5458,7 @@ TEST_P(DBTestWithParam, PreShutdownManualCompaction) { // Compact all MakeTables(1, "a", "z", 1); ASSERT_EQ("1,0,2", FilesPerLevel(1)); - CancelAllBackgroundWork(db_); + CancelAllBackgroundWork(db_.get()); ASSERT_TRUE( db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr) .IsShutdownInProgress()); @@ -5090,7 +5478,7 @@ TEST_F(DBTest, PreShutdownFlush) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "key", "value")); - CancelAllBackgroundWork(db_); + CancelAllBackgroundWork(db_.get()); Status s = db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); ASSERT_TRUE(s.IsShutdownInProgress()); @@ -5171,7 +5559,7 @@ TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) { TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown"); ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); - CancelAllBackgroundWork(db_); + CancelAllBackgroundWork(db_.get()); TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"); ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); // Record the number of compactions at a time. @@ -5257,7 +5645,7 @@ TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) { } ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); - CancelAllBackgroundWork(db_); + CancelAllBackgroundWork(db_.get()); TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown"); TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"); ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); @@ -5272,279 +5660,13 @@ TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) { ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0); } -#endif // ROCKSDB_USING_THREAD_STATUS +#endif // !NROCKSDB_THREAD_STATUS TEST_F(DBTest, FlushOnDestroy) { WriteOptions wo; wo.disableWAL = true; ASSERT_OK(Put("foo", "v1", wo)); - CancelAllBackgroundWork(db_); -} - -TEST_F(DBTest, DynamicLevelCompressionPerLevel) { - if (!Snappy_Supported()) { - return; - } - const int kNKeys = 120; - int keys[kNKeys]; - for (int i = 0; i < kNKeys; i++) { - keys[i] = i; - } - - Random rnd(301); - Options options; - options.env = env_; - options.create_if_missing = true; - options.db_write_buffer_size = 20480; - options.write_buffer_size = 20480; - options.max_write_buffer_number = 2; - options.level0_file_num_compaction_trigger = 2; - options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 2; - options.target_file_size_base = 20480; - options.level_compaction_dynamic_level_bytes = true; - options.max_bytes_for_level_base = 102400; - options.max_bytes_for_level_multiplier = 4; - options.max_background_compactions = 1; - options.num_levels = 5; - options.statistics = CreateDBStatistics(); - - options.compression_per_level.resize(3); - // No compression for L0 - options.compression_per_level[0] = kNoCompression; - // No compression for the Ln whre L0 is compacted to - options.compression_per_level[1] = kNoCompression; - // Snappy compression for Ln+1 - options.compression_per_level[2] = kSnappyCompression; - - OnFileDeletionListener* listener = new OnFileDeletionListener(); - options.listeners.emplace_back(listener); - - DestroyAndReopen(options); - - // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should - // be compressed, so there shouldn't be any compression. - for (int i = 0; i < 20; i++) { - ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); - ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); - } - ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 0); - ASSERT_EQ(NumTableFilesAtLevel(3), 0); - ASSERT_TRUE(NumTableFilesAtLevel(0) > 0 || NumTableFilesAtLevel(4) > 0); - - // Verify there was no compression - auto num_block_compressed = - options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); - ASSERT_EQ(num_block_compressed, 0); - - // Insert 400KB and there will be some files end up in L3. According to the - // above compression settings for each level, there will be some compression. - ASSERT_OK(options.statistics->Reset()); - ASSERT_EQ(num_block_compressed, 0); - for (int i = 20; i < 120; i++) { - ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); - ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); - } - ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 0); - ASSERT_GE(NumTableFilesAtLevel(3), 1); - ASSERT_GE(NumTableFilesAtLevel(4), 1); - - // Verify there was compression - num_block_compressed = - options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); - ASSERT_GT(num_block_compressed, 0); - - // Make sure data in files in L3 is not compacted by removing all files - // in L4 and calculate number of rows - ASSERT_OK(dbfull()->SetOptions({ - {"disable_auto_compactions", "true"}, - })); - ColumnFamilyMetaData cf_meta; - db_->GetColumnFamilyMetaData(&cf_meta); - - // Ensure that L1+ files are non-overlapping and together with L0 encompass - // full key range between smallestkey and largestkey from CF file metadata. - int largestkey_in_prev_level = -1; - int keys_found = 0; - for (int level = (int)cf_meta.levels.size() - 1; level >= 0; level--) { - int files_in_level = (int)cf_meta.levels[level].files.size(); - int largestkey_in_prev_file = -1; - for (int j = 0; j < files_in_level; j++) { - int smallestkey = IdFromKey(cf_meta.levels[level].files[j].smallestkey); - int largestkey = IdFromKey(cf_meta.levels[level].files[j].largestkey); - int num_entries = (int)cf_meta.levels[level].files[j].num_entries; - ASSERT_EQ(num_entries, largestkey - smallestkey + 1); - keys_found += num_entries; - if (level > 0) { - if (j == 0) { - ASSERT_GT(smallestkey, largestkey_in_prev_level); - } - if (j > 0) { - ASSERT_GT(smallestkey, largestkey_in_prev_file); - } - if (j == files_in_level - 1) { - largestkey_in_prev_level = largestkey; - } - } - largestkey_in_prev_file = largestkey; - } - } - ASSERT_EQ(keys_found, kNKeys); - - for (const auto& file : cf_meta.levels[4].files) { - listener->SetExpectedFileName(dbname_ + file.name); - Slice start(file.smallestkey), limit(file.largestkey); - const RangePtr ranges(&start, &limit); - // Given verification from above, we're guaranteed that by deleting all the - // files in [, ] range, we're effectively deleting - // that very single file and nothing more. - EXPECT_OK(dbfull()->DeleteFilesInRanges(dbfull()->DefaultColumnFamily(), - &ranges, true /* include_end */)); - } - listener->VerifyMatchedCount(cf_meta.levels[4].files.size()); - - int num_keys = 0; - std::unique_ptr iter(db_->NewIterator(ReadOptions())); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - num_keys++; - } - ASSERT_OK(iter->status()); - - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 0); - ASSERT_GE(NumTableFilesAtLevel(3), 1); - ASSERT_EQ(NumTableFilesAtLevel(4), 0); - - ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U); -} - -TEST_F(DBTest, DynamicLevelCompressionPerLevel2) { - if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) { - return; - } - const int kNKeys = 500; - int keys[kNKeys]; - for (int i = 0; i < kNKeys; i++) { - keys[i] = i; - } - RandomShuffle(std::begin(keys), std::end(keys)); - - Random rnd(301); - Options options; - options.create_if_missing = true; - options.db_write_buffer_size = 6000000; - options.write_buffer_size = 600000; - options.max_write_buffer_number = 2; - options.level0_file_num_compaction_trigger = 2; - options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 2; - options.soft_pending_compaction_bytes_limit = 1024 * 1024; - options.target_file_size_base = 20; - options.env = env_; - options.level_compaction_dynamic_level_bytes = true; - options.max_bytes_for_level_base = 200; - options.max_bytes_for_level_multiplier = 8; - options.max_background_compactions = 1; - options.num_levels = 5; - std::shared_ptr mtf(new mock::MockTableFactory); - options.table_factory = mtf; - - options.compression_per_level.resize(3); - options.compression_per_level[0] = kNoCompression; - options.compression_per_level[1] = kLZ4Compression; - options.compression_per_level[2] = kZlibCompression; - - DestroyAndReopen(options); - // When base level is L4, L4 is LZ4. - std::atomic num_zlib(0); - std::atomic num_lz4(0); - std::atomic num_no(0); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = static_cast(arg); - if (compaction->output_level() == 4) { - ASSERT_TRUE(compaction->output_compression() == kLZ4Compression); - num_lz4.fetch_add(1); - } - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) { - auto* compression = static_cast(arg); - ASSERT_TRUE(*compression == kNoCompression); - num_no.fetch_add(1); - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - for (int i = 0; i < 100; i++) { - std::string value = rnd.RandomString(200); - ASSERT_OK(Put(Key(keys[i]), value)); - if (i % 25 == 24) { - ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - } - } - - ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 0); - ASSERT_EQ(NumTableFilesAtLevel(3), 0); - ASSERT_GT(NumTableFilesAtLevel(4), 0); - ASSERT_GT(num_no.load(), 2); - ASSERT_GT(num_lz4.load(), 0); - int prev_num_files_l4 = NumTableFilesAtLevel(4); - - // After base level turn L4->L3, L3 becomes LZ4 and L4 becomes Zlib - num_lz4.store(0); - num_no.store(0); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = static_cast(arg); - if (compaction->output_level() == 4 && compaction->start_level() == 3) { - ASSERT_TRUE(compaction->output_compression() == kZlibCompression); - num_zlib.fetch_add(1); - } else { - ASSERT_TRUE(compaction->output_compression() == kLZ4Compression); - num_lz4.fetch_add(1); - } - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) { - auto* compression = static_cast(arg); - ASSERT_TRUE(*compression == kNoCompression); - num_no.fetch_add(1); - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - for (int i = 101; i < 500; i++) { - std::string value = rnd.RandomString(200); - ASSERT_OK(Put(Key(keys[i]), value)); - if (i % 100 == 99) { - ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - } - } - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 0); - ASSERT_GT(NumTableFilesAtLevel(3), 0); - ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4); - ASSERT_GT(num_no.load(), 2); - ASSERT_GT(num_lz4.load(), 0); - ASSERT_GT(num_zlib.load(), 0); + CancelAllBackgroundWork(db_.get()); } TEST_F(DBTest, DynamicCompactionOptions) { @@ -6083,53 +6205,6 @@ TEST_F(DBTest, L0L1L2AndUpHitCounter) { TestGetTickerCount(options, GET_HIT_L2_AND_UP)); } -TEST_F(DBTest, EncodeDecompressedBlockSizeTest) { - // iter 0 -- zlib - // iter 1 -- bzip2 - // iter 2 -- lz4 - // iter 3 -- lz4HC - // iter 4 -- xpress - CompressionType compressions[] = {kZlibCompression, kBZip2Compression, - kLZ4Compression, kLZ4HCCompression, - kXpressCompression}; - for (auto comp : compressions) { - if (!CompressionTypeSupported(comp)) { - continue; - } - // first_table_version 1 -- generate with table_version == 1, read with - // table_version == 2 - // first_table_version 2 -- generate with table_version == 2, read with - // table_version == 1 - for (int first_table_version = 1; first_table_version <= 2; - ++first_table_version) { - BlockBasedTableOptions table_options; - table_options.format_version = first_table_version; - table_options.filter_policy.reset(NewBloomFilterPolicy(10)); - Options options = CurrentOptions(); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.create_if_missing = true; - options.compression = comp; - DestroyAndReopen(options); - - int kNumKeysWritten = 1000; - - Random rnd(301); - for (int i = 0; i < kNumKeysWritten; ++i) { - // compressible string - ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a'))); - } - - table_options.format_version = first_table_version == 1 ? 2 : 1; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - for (int i = 0; i < kNumKeysWritten; ++i) { - auto r = Get(Key(i)); - ASSERT_EQ(r.substr(128), std::string(128, 'a')); - } - } - } -} - TEST_F(DBTest, CloseSpeedup) { Options options = CurrentOptions(); options.compaction_style = kCompactionStyleLevel; @@ -6254,9 +6329,9 @@ TEST_F(DBTest, MergeTestTime) { ASSERT_EQ(1, count); ASSERT_EQ(4000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); -#ifdef ROCKSDB_USING_THREAD_STATUS +#ifndef NROCKSDB_THREAD_STATUS ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0); -#endif // ROCKSDB_USING_THREAD_STATUS +#endif // !NROCKSDB_THREAD_STATUS } TEST_P(DBTestWithParam, MergeCompactionTimeTest) { @@ -6366,7 +6441,8 @@ TEST_P(DBTestWithParam, CompactionTotalTimeTest) { ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); // Hard-coded number in CompactionJob::ProcessKeyValueCompaction(). - const int kRecordStatsEvery = 1000; + // Uses 1024 (power of 2) for efficient bitwise check. + const int kRecordStatsEvery = 1024; // The stat COMPACTION_CPU_TOTAL_TIME should be recorded // during compaction and once more after compaction. ASSERT_EQ(n / kRecordStatsEvery + 1, record_count); @@ -6389,7 +6465,7 @@ TEST_F(DBTest, TestLogCleanup) { for (int i = 0; i < 100000; ++i) { ASSERT_OK(Put(Key(i), "val")); - // only 2 memtables will be alive, so logs_to_free needs to always be below + // only 2 memtables will be alive, so wals_to_free needs to always be below // 2 ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast(3)); } @@ -6458,7 +6534,7 @@ TEST_F(DBTest, SuggestCompactRangeTest) { // compact it three times for (int i = 0; i < 3; ++i) { - ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr)); + ASSERT_OK(experimental::SuggestCompactRange(db_.get(), nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } @@ -6471,7 +6547,7 @@ TEST_F(DBTest, SuggestCompactRangeTest) { // nonoverlapping with the file on level 0 Slice start("a"), end("b"); - ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); + ASSERT_OK(experimental::SuggestCompactRange(db_.get(), &start, &end)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); // should not compact the level 0 file @@ -6479,7 +6555,7 @@ TEST_F(DBTest, SuggestCompactRangeTest) { start = Slice("j"); end = Slice("m"); - ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); + ASSERT_OK(experimental::SuggestCompactRange(db_.get(), &start, &end)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); // SuggestCompactRange() is not going to be reported as manual compaction ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual( @@ -6530,7 +6606,7 @@ TEST_F(DBTest, SuggestCompactRangeUniversal) { // nonoverlapping with the file on level 0 Slice start("a"), end("b"); - ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); + ASSERT_OK(experimental::SuggestCompactRange(db_.get(), &start, &end)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); // should not compact the level 0 file @@ -6538,7 +6614,7 @@ TEST_F(DBTest, SuggestCompactRangeUniversal) { start = Slice("j"); end = Slice("m"); - ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); + ASSERT_OK(experimental::SuggestCompactRange(db_.get(), &start, &end)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); // now it should compact the level 0 file to the last level @@ -6575,7 +6651,7 @@ TEST_F(DBTest, PromoteL0) { ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1 // Promote L0 level to L2. - ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2)); + ASSERT_OK(experimental::PromoteL0(db_.get(), db_->DefaultColumnFamily(), 2)); // We expect that all the files were trivially moved from L0 to L2 ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files); @@ -6600,7 +6676,7 @@ TEST_F(DBTest, PromoteL0Failure) { Status status; // Fails because L0 has overlapping files. - status = experimental::PromoteL0(db_, db_->DefaultColumnFamily()); + status = experimental::PromoteL0(db_.get(), db_->DefaultColumnFamily()); ASSERT_TRUE(status.IsInvalidArgument()); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -6610,7 +6686,7 @@ TEST_F(DBTest, PromoteL0Failure) { ASSERT_OK(Put(Key(5), "")); ASSERT_OK(Flush()); // Fails because L1 is non-empty. - status = experimental::PromoteL0(db_, db_->DefaultColumnFamily()); + status = experimental::PromoteL0(db_.get(), db_->DefaultColumnFamily()); ASSERT_TRUE(status.IsInvalidArgument()); } @@ -7205,27 +7281,6 @@ TEST_F(DBTest, LastWriteBufferDelay) { } #endif // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) -TEST_F(DBTest, FailWhenCompressionNotSupportedTest) { - CompressionType compressions[] = {kZlibCompression, kBZip2Compression, - kLZ4Compression, kLZ4HCCompression, - kXpressCompression}; - for (auto comp : compressions) { - if (!CompressionTypeSupported(comp)) { - // not supported, we should fail the Open() - Options options = CurrentOptions(); - options.compression = comp; - ASSERT_TRUE(!TryReopen(options).ok()); - // Try if CreateColumnFamily also fails - options.compression = kNoCompression; - ASSERT_OK(TryReopen(options)); - ColumnFamilyOptions cf_options(options); - cf_options.compression = comp; - ColumnFamilyHandle* handle; - ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok()); - } - } -} - TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) { Options options = CurrentOptions(); options.max_open_files = 100; @@ -7702,7 +7757,7 @@ TEST_F(DBTest, ShuttingDownNotBlockStalledWrites) { }); TEST_SYNC_POINT("DBTest::ShuttingDownNotBlockStalledWrites"); - CancelAllBackgroundWork(db_, true); + CancelAllBackgroundWork(db_.get(), true); thd.join(); } diff --git a/db/db_test2.cc b/db/db_test2.cc index 6c4f6243719d..6129e2d923b8 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -10,7 +10,6 @@ #include #include #include -#include #include #include "db/db_test_util.h" @@ -40,7 +39,7 @@ class DBTest2 : public DBTestBase { }; TEST_F(DBTest2, OpenForReadOnly) { - DB* db_ptr = nullptr; + std::unique_ptr db_ptr; std::string dbname = test::PerThreadDBPath("db_readonly"); Options options = CurrentOptions(); options.create_if_missing = true; @@ -64,7 +63,7 @@ TEST_F(DBTest2, OpenForReadOnly) { } TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) { - DB* db_ptr = nullptr; + std::unique_ptr db_ptr; std::string dbname = test::PerThreadDBPath("db_readonly"); Options options = CurrentOptions(); options.create_if_missing = true; @@ -350,9 +349,9 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); ASSERT_OK(Flush(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"), static_cast(1)); flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager; @@ -372,13 +371,13 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { // No flush should trigger wait_flush(); { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"), static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"), static_cast(1)); } @@ -388,13 +387,13 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); wait_flush(); { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"), static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"), static_cast(2)); } @@ -406,13 +405,13 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); wait_flush(); { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"), static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"), static_cast(2)); } @@ -429,13 +428,13 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); wait_flush(); { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"), static_cast(2)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"), static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"), static_cast(2)); } @@ -451,13 +450,13 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { wait_flush(); { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"), static_cast(2)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"), static_cast(2)); } if (cost_cache_) { @@ -507,7 +506,7 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { CreateAndReopenWithCF({"cf1", "cf2"}, options); ASSERT_OK(DestroyDB(dbname2, options)); - DB* db2 = nullptr; + std::unique_ptr db2; ASSERT_OK(DB::Open(options, dbname2, &db2)); WriteOptions wo; @@ -517,12 +516,12 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); - ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); + ASSERT_OK(static_cast(db2.get())->TEST_WaitForFlushMemTable()); // Ensure background work is fully finished including listener callbacks // before accessing listener state. ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); - ASSERT_OK( - static_cast_with_check(db2)->TEST_WaitForBackgroundWork()); + ASSERT_OK(static_cast_with_check(db2.get()) + ->TEST_WaitForBackgroundWork()); }; // Trigger a flush on cf2 @@ -538,13 +537,13 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); wait_flush(); - ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); + ASSERT_OK(static_cast(db2.get())->TEST_WaitForFlushMemTable()); { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") + - GetNumberOfSstFilesForColumnFamily(db_, "cf1") + - GetNumberOfSstFilesForColumnFamily(db_, "cf2"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default") + + GetNumberOfSstFilesForColumnFamily(db_.get(), "cf1") + + GetNumberOfSstFilesForColumnFamily(db_.get(), "cf2"), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2.get(), "default"), static_cast(0)); } @@ -554,13 +553,13 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); wait_flush(); { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "cf1"), static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "cf2"), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2.get(), "default"), static_cast(0)); } @@ -569,19 +568,19 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { wait_flush(); ASSERT_OK(db2->Put(wo, Key(1), DummyString(1))); wait_flush(); - ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); + ASSERT_OK(static_cast(db2.get())->TEST_WaitForFlushMemTable()); { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "cf1"), static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "cf2"), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2.get(), "default"), static_cast(1)); } - delete db2; + db2.reset(); ASSERT_OK(DestroyDB(dbname2, options)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -786,7 +785,7 @@ TEST_F(DBTest2, WalFilterTest) { while (true) { // Ensure that expected keys exists // and not expected keys don't exist after recovery - ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist); + ValidateKeyExistence(db_.get(), keys_must_exist, keys_must_not_exist); if (checked_after_reopen) { break; @@ -923,7 +922,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) { while (true) { // Ensure that expected keys exists // and not expected keys don't exist after recovery - ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist); + ValidateKeyExistence(db_.get(), keys_must_exist, keys_must_not_exist); if (checked_after_reopen) { break; @@ -1005,7 +1004,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) { } } - ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist); + ValidateKeyExistence(db_.get(), keys_must_exist, keys_must_not_exist); } TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { @@ -1186,705 +1185,6 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { ASSERT_EQ(index, keys_cf.size()); } -TEST_F(DBTest2, PresetCompressionDict) { - // Verifies that compression ratio improves when dictionary is enabled, and - // improves even further when the dictionary is trained by ZSTD. - const size_t kBlockSizeBytes = 4 << 10; - const size_t kL0FileBytes = 128 << 10; - const size_t kApproxPerBlockOverheadBytes = 50; - const int kNumL0Files = 5; - - Options options; - // Make sure to use any custom env that the test is configured with. - options.env = CurrentOptions().env; - options.allow_concurrent_memtable_write = false; - options.arena_block_size = kBlockSizeBytes; - options.create_if_missing = true; - options.disable_auto_compactions = true; - options.level0_file_num_compaction_trigger = kNumL0Files; - options.memtable_factory.reset( - test::NewSpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes)); - options.num_levels = 2; - options.target_file_size_base = kL0FileBytes; - options.target_file_size_multiplier = 2; - options.write_buffer_size = kL0FileBytes; - BlockBasedTableOptions table_options; - table_options.block_size = kBlockSizeBytes; - std::vector compression_types; - if (Zlib_Supported()) { - compression_types.push_back(kZlibCompression); - } -#if LZ4_VERSION_NUMBER >= 10400 // r124+ - compression_types.push_back(kLZ4Compression); - compression_types.push_back(kLZ4HCCompression); -#endif // LZ4_VERSION_NUMBER >= 10400 - if (ZSTD_Supported()) { - compression_types.push_back(kZSTD); - } - - enum DictionaryTypes : int { - kWithoutDict, - kWithDict, - kWithZSTDfinalizeDict, - kWithZSTDTrainedDict, - kDictEnd, - }; - - for (auto compression_type : compression_types) { - options.compression = compression_type; - size_t bytes_without_dict = 0; - size_t bytes_with_dict = 0; - size_t bytes_with_zstd_finalize_dict = 0; - size_t bytes_with_zstd_trained_dict = 0; - for (int i = kWithoutDict; i < kDictEnd; i++) { - // First iteration: compress without preset dictionary - // Second iteration: compress with preset dictionary - // Third iteration (zstd only): compress with zstd-trained dictionary - // - // To make sure the compression dictionary has the intended effect, we - // verify the compressed size is smaller in successive iterations. Also in - // the non-first iterations, verify the data we get out is the same data - // we put in. - switch (i) { - case kWithoutDict: - options.compression_opts.max_dict_bytes = 0; - options.compression_opts.zstd_max_train_bytes = 0; - break; - case kWithDict: - options.compression_opts.max_dict_bytes = kBlockSizeBytes; - options.compression_opts.zstd_max_train_bytes = 0; - break; - case kWithZSTDfinalizeDict: - if (compression_type != kZSTD || - !ZSTD_FinalizeDictionarySupported()) { - continue; - } - options.compression_opts.max_dict_bytes = kBlockSizeBytes; - options.compression_opts.zstd_max_train_bytes = kL0FileBytes; - options.compression_opts.use_zstd_dict_trainer = false; - break; - case kWithZSTDTrainedDict: - if (compression_type != kZSTD || !ZSTD_TrainDictionarySupported()) { - continue; - } - options.compression_opts.max_dict_bytes = kBlockSizeBytes; - options.compression_opts.zstd_max_train_bytes = kL0FileBytes; - options.compression_opts.use_zstd_dict_trainer = true; - break; - default: - assert(false); - } - - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - CreateAndReopenWithCF({"pikachu"}, options); - Random rnd(301); - std::string seq_datas[10]; - for (int j = 0; j < 10; ++j) { - seq_datas[j] = - rnd.RandomString(kBlockSizeBytes - kApproxPerBlockOverheadBytes); - } - - ASSERT_EQ(0, NumTableFilesAtLevel(0, 1)); - for (int j = 0; j < kNumL0Files; ++j) { - for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) { - auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k; - ASSERT_OK(Put(1, Key(static_cast(key_num)), - seq_datas[(key_num / 10) % 10])); - } - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); - ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1)); - } - ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], - true /* disallow_trivial_move */)); - ASSERT_EQ(0, NumTableFilesAtLevel(0, 1)); - ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); - - // Get the live sst files size - size_t total_sst_bytes = TotalSize(1); - if (i == kWithoutDict) { - bytes_without_dict = total_sst_bytes; - } else if (i == kWithDict) { - bytes_with_dict = total_sst_bytes; - } else if (i == kWithZSTDfinalizeDict) { - bytes_with_zstd_finalize_dict = total_sst_bytes; - } else if (i == kWithZSTDTrainedDict) { - bytes_with_zstd_trained_dict = total_sst_bytes; - } - - for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes); - j++) { - ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast(j)))); - } - if (i == kWithDict) { - ASSERT_GT(bytes_without_dict, bytes_with_dict); - } else if (i == kWithZSTDTrainedDict) { - // In zstd compression, it is sometimes possible that using a finalized - // dictionary does not get as good a compression ratio as raw content - // dictionary. But using a dictionary should always get better - // compression ratio than not using one. - ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_finalize_dict || - bytes_without_dict > bytes_with_zstd_finalize_dict); - } else if (i == kWithZSTDTrainedDict) { - // In zstd compression, it is sometimes possible that using a trained - // dictionary does not get as good a compression ratio as without - // training. - // But using a dictionary (with or without training) should always get - // better compression ratio than not using one. - ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict || - bytes_without_dict > bytes_with_zstd_trained_dict); - } - - DestroyAndReopen(options); - } - } -} - -TEST_F(DBTest2, PresetCompressionDictLocality) { - if (!ZSTD_Supported()) { - return; - } - // Verifies that compression dictionary is generated from local data. The - // verification simply checks all output SSTs have different compression - // dictionaries. We do not verify effectiveness as that'd likely be flaky in - // the future. - const int kNumEntriesPerFile = 1 << 10; // 1KB - const int kNumBytesPerEntry = 1 << 10; // 1KB - const int kNumFiles = 4; - Options options = CurrentOptions(); - options.compression = kZSTD; - options.compression_opts.max_dict_bytes = 1 << 14; // 16KB - options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry; - BlockBasedTableOptions table_options; - table_options.cache_index_and_filter_blocks = true; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - - Random rnd(301); - for (int i = 0; i < kNumFiles; ++i) { - for (int j = 0; j < kNumEntriesPerFile; ++j) { - ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j), - rnd.RandomString(kNumBytesPerEntry))); - } - ASSERT_OK(Flush()); - MoveFilesToLevel(1); - ASSERT_EQ(NumTableFilesAtLevel(1), i + 1); - } - - // Store all the dictionaries generated during a full compaction. - std::vector compression_dicts; - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict", - [&](void* arg) { - compression_dicts.emplace_back(static_cast(arg)->ToString()); - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - CompactRangeOptions compact_range_opts; - compact_range_opts.bottommost_level_compaction = - BottommostLevelCompaction::kForceOptimized; - ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr)); - - // Dictionary compression should not be so good as to compress four totally - // random files into one. If it does then there's probably something wrong - // with the test. - ASSERT_GT(NumTableFilesAtLevel(1), 1); - - // Furthermore, there should be one compression dictionary generated per file. - // And they should all be different from each other. - ASSERT_EQ(NumTableFilesAtLevel(1), - static_cast(compression_dicts.size())); - for (size_t i = 1; i < compression_dicts.size(); ++i) { - std::string& a = compression_dicts[i - 1]; - std::string& b = compression_dicts[i]; - size_t alen = a.size(); - size_t blen = b.size(); - ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0); - } -} - -class PresetCompressionDictTest - : public DBTestBase, - public testing::WithParamInterface> { - public: - PresetCompressionDictTest() - : DBTestBase("db_test2", false /* env_do_fsync */), - compression_type_(std::get<0>(GetParam())), - bottommost_(std::get<1>(GetParam())) {} - - protected: - const CompressionType compression_type_; - const bool bottommost_; -}; - -INSTANTIATE_TEST_CASE_P( - DBTest2, PresetCompressionDictTest, - ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()), - ::testing::Bool())); - -TEST_P(PresetCompressionDictTest, Flush) { - // Verifies that dictionary is generated and written during flush only when - // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the - // size of the dictionary is within expectations according to the limit on - // buffering set by `CompressionOptions::max_dict_buffer_bytes`. - const size_t kValueLen = 256; - const size_t kKeysPerFile = 1 << 10; - const size_t kDictLen = 16 << 10; - const size_t kBlockLen = 4 << 10; - - Options options = CurrentOptions(); - if (bottommost_) { - options.bottommost_compression = compression_type_; - options.bottommost_compression_opts.enabled = true; - options.bottommost_compression_opts.max_dict_bytes = kDictLen; - options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; - } else { - options.compression = compression_type_; - options.compression_opts.max_dict_bytes = kDictLen; - options.compression_opts.max_dict_buffer_bytes = kBlockLen; - } - options.memtable_factory.reset(test::NewSpecialSkipListFactory(kKeysPerFile)); - options.statistics = CreateDBStatistics(); - BlockBasedTableOptions bbto; - bbto.block_size = kBlockLen; - bbto.cache_index_and_filter_blocks = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); - - Random rnd(301); - for (size_t i = 0; i <= kKeysPerFile; ++i) { - ASSERT_OK(Put(Key(static_cast(i)), rnd.RandomString(kValueLen))); - } - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); - - // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a - // compression dictionary exists since dictionaries would be preloaded when - // the flush finishes. - if (bottommost_) { - // Flush is never considered bottommost. This should change in the future - // since flushed files may have nothing underneath them, like the one in - // this test case. - ASSERT_EQ( - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - 0); - } else { - ASSERT_GT( - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - 0); - // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on - // number of bytes needs to be adjusted in case the cached block is in - // ZSTD's digested dictionary format. - if (compression_type_ != kZSTD) { - // Although we limited buffering to `kBlockLen`, there may be up to two - // blocks of data included in the dictionary since we only check limit - // after each block is built. - ASSERT_LE(TestGetTickerCount(options, - BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - 2 * kBlockLen); - } - } -} - -TEST_P(PresetCompressionDictTest, CompactNonBottommost) { - // Verifies that dictionary is generated and written during compaction to - // non-bottommost level only when `ColumnFamilyOptions::compression` enables - // dictionary. Also verifies the size of the dictionary is within expectations - // according to the limit on buffering set by - // `CompressionOptions::max_dict_buffer_bytes`. - const size_t kValueLen = 256; - const size_t kKeysPerFile = 1 << 10; - const size_t kDictLen = 16 << 10; - const size_t kBlockLen = 4 << 10; - - Options options = CurrentOptions(); - if (bottommost_) { - options.bottommost_compression = compression_type_; - options.bottommost_compression_opts.enabled = true; - options.bottommost_compression_opts.max_dict_bytes = kDictLen; - options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; - } else { - options.compression = compression_type_; - options.compression_opts.max_dict_bytes = kDictLen; - options.compression_opts.max_dict_buffer_bytes = kBlockLen; - } - options.disable_auto_compactions = true; - options.statistics = CreateDBStatistics(); - BlockBasedTableOptions bbto; - bbto.block_size = kBlockLen; - bbto.cache_index_and_filter_blocks = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); - - Random rnd(301); - for (size_t j = 0; j <= kKeysPerFile; ++j) { - ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); - } - ASSERT_OK(Flush()); - MoveFilesToLevel(2); - - for (int i = 0; i < 2; ++i) { - for (size_t j = 0; j <= kKeysPerFile; ++j) { - ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); - } - ASSERT_OK(Flush()); - } - ASSERT_EQ("2,0,1", FilesPerLevel(0)); - - uint64_t prev_compression_dict_bytes_inserted = - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT); - // This L0->L1 compaction merges the two L0 files into L1. The produced L1 - // file is not bottommost due to the existing L2 file covering the same key- - // range. - ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); - ASSERT_EQ("0,1,1", FilesPerLevel(0)); - // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a - // compression dictionary exists since dictionaries would be preloaded when - // the compaction finishes. - if (bottommost_) { - ASSERT_EQ( - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - prev_compression_dict_bytes_inserted); - } else { - ASSERT_GT( - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - prev_compression_dict_bytes_inserted); - // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on - // number of bytes needs to be adjusted in case the cached block is in - // ZSTD's digested dictionary format. - if (compression_type_ != kZSTD) { - // Although we limited buffering to `kBlockLen`, there may be up to two - // blocks of data included in the dictionary since we only check limit - // after each block is built. - ASSERT_LE(TestGetTickerCount(options, - BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - prev_compression_dict_bytes_inserted + 2 * kBlockLen); - } - } -} - -TEST_P(PresetCompressionDictTest, CompactBottommost) { - // Verifies that dictionary is generated and written during compaction to - // non-bottommost level only when either `ColumnFamilyOptions::compression` or - // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also - // verifies the size of the dictionary is within expectations according to the - // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`. - const size_t kValueLen = 256; - const size_t kKeysPerFile = 1 << 10; - const size_t kDictLen = 16 << 10; - const size_t kBlockLen = 4 << 10; - - Options options = CurrentOptions(); - if (bottommost_) { - options.bottommost_compression = compression_type_; - options.bottommost_compression_opts.enabled = true; - options.bottommost_compression_opts.max_dict_bytes = kDictLen; - options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; - } else { - options.compression = compression_type_; - options.compression_opts.max_dict_bytes = kDictLen; - options.compression_opts.max_dict_buffer_bytes = kBlockLen; - } - options.disable_auto_compactions = true; - options.statistics = CreateDBStatistics(); - BlockBasedTableOptions bbto; - bbto.block_size = kBlockLen; - bbto.cache_index_and_filter_blocks = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); - - Random rnd(301); - for (int i = 0; i < 2; ++i) { - for (size_t j = 0; j <= kKeysPerFile; ++j) { - ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); - } - ASSERT_OK(Flush()); - } - ASSERT_EQ("2", FilesPerLevel(0)); - - uint64_t prev_compression_dict_bytes_inserted = - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT); - CompactRangeOptions cro; - ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - ASSERT_EQ("0,1", FilesPerLevel(0)); - ASSERT_GT( - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - prev_compression_dict_bytes_inserted); - // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on - // number of bytes needs to be adjusted in case the cached block is in ZSTD's - // digested dictionary format. - if (compression_type_ != kZSTD) { - // Although we limited buffering to `kBlockLen`, there may be up to two - // blocks of data included in the dictionary since we only check limit after - // each block is built. - ASSERT_LE( - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - prev_compression_dict_bytes_inserted + 2 * kBlockLen); - } -} - -class CompactionCompressionListener : public EventListener { - public: - explicit CompactionCompressionListener(Options* db_options) - : db_options_(db_options) {} - - void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override { - // Figure out last level with files - int bottommost_level = 0; - for (int level = 0; level < db->NumberLevels(); level++) { - std::string files_at_level; - ASSERT_TRUE( - db->GetProperty("rocksdb.num-files-at-level" + std::to_string(level), - &files_at_level)); - if (files_at_level != "0") { - bottommost_level = level; - } - } - - if (db_options_->bottommost_compression != kDisableCompressionOption && - ci.output_level == bottommost_level) { - ASSERT_EQ(ci.compression, db_options_->bottommost_compression); - } else if (db_options_->compression_per_level.size() != 0) { - ASSERT_EQ(ci.compression, - db_options_->compression_per_level[ci.output_level]); - } else { - ASSERT_EQ(ci.compression, db_options_->compression); - } - max_level_checked = std::max(max_level_checked, ci.output_level); - } - - int max_level_checked = 0; - const Options* db_options_; -}; - -enum CompressionFailureType { - kTestCompressionFail, - kTestDecompressionFail, - kTestDecompressionCorruption -}; - -class CompressionFailuresTest - : public DBTest2, - public testing::WithParamInterface> { - public: - CompressionFailuresTest() { - std::tie(compression_failure_type_, compression_type_, - compression_max_dict_bytes_, compression_parallel_threads_) = - GetParam(); - } - - CompressionFailureType compression_failure_type_ = kTestCompressionFail; - CompressionType compression_type_ = kNoCompression; - uint32_t compression_max_dict_bytes_ = 0; - uint32_t compression_parallel_threads_ = 0; -}; - -INSTANTIATE_TEST_CASE_P( - DBTest2, CompressionFailuresTest, - ::testing::Combine(::testing::Values(kTestCompressionFail, - kTestDecompressionFail, - kTestDecompressionCorruption), - ::testing::ValuesIn(GetSupportedCompressions()), - ::testing::Values(0, 10), ::testing::Values(1, 4))); - -TEST_P(CompressionFailuresTest, CompressionFailures) { - if (compression_type_ == kNoCompression) { - return; - } - - Options options = CurrentOptions(); - options.level0_file_num_compaction_trigger = 2; - options.max_bytes_for_level_base = 1024; - options.max_bytes_for_level_multiplier = 2; - options.num_levels = 7; - options.max_background_compactions = 1; - options.target_file_size_base = 512; - - BlockBasedTableOptions table_options; - table_options.block_size = 512; - table_options.verify_compression = true; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - options.compression = compression_type_; - options.compression_opts.parallel_threads = compression_parallel_threads_; - options.compression_opts.max_dict_bytes = compression_max_dict_bytes_; - options.bottommost_compression_opts.parallel_threads = - compression_parallel_threads_; - options.bottommost_compression_opts.max_dict_bytes = - compression_max_dict_bytes_; - - if (compression_failure_type_ == kTestCompressionFail) { - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "CompressData:TamperWithReturnValue", [](void* arg) { - bool* ret = static_cast(arg); - *ret = false; - }); - } else if (compression_failure_type_ == kTestDecompressionFail) { - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "UncompressBlockData:TamperWithReturnValue", [](void* arg) { - Status* ret = static_cast(arg); - ASSERT_OK(*ret); - *ret = Status::Corruption("kTestDecompressionFail"); - }); - } else if (compression_failure_type_ == kTestDecompressionCorruption) { - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "UncompressBlockData:" - "TamperWithDecompressionOutput", - [](void* arg) { - BlockContents* contents = static_cast(arg); - // Ensure uncompressed data != original data - const size_t len = contents->data.size() + 1; - std::unique_ptr fake_data(new char[len]()); - *contents = BlockContents(std::move(fake_data), len); - }); - } - - std::map key_value_written; - - const int kKeySize = 5; - const int kValUnitSize = 16; - const int kValSize = 256; - Random rnd(405); - - Status s = Status::OK(); - - DestroyAndReopen(options); - // Write 10 random files - for (int i = 0; i < 10; i++) { - for (int j = 0; j < 5; j++) { - std::string key = rnd.RandomString(kKeySize); - // Ensure good compression ratio - std::string valueUnit = rnd.RandomString(kValUnitSize); - std::string value; - for (int k = 0; k < kValSize; k += kValUnitSize) { - value += valueUnit; - } - s = Put(key, value); - if (compression_failure_type_ == kTestCompressionFail) { - key_value_written[key] = value; - ASSERT_OK(s); - } - } - s = Flush(); - if (compression_failure_type_ == kTestCompressionFail) { - ASSERT_OK(s); - } - s = dbfull()->TEST_WaitForCompact(); - if (compression_failure_type_ == kTestCompressionFail) { - ASSERT_OK(s); - } - if (i == 4) { - // Make compression fail at the mid of table building - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - } - } - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - - if (compression_failure_type_ == kTestCompressionFail) { - // Should be kNoCompression, check content consistency - std::unique_ptr db_iter(db_->NewIterator(ReadOptions())); - for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { - std::string key = db_iter->key().ToString(); - std::string value = db_iter->value().ToString(); - ASSERT_NE(key_value_written.find(key), key_value_written.end()); - ASSERT_EQ(key_value_written[key], value); - key_value_written.erase(key); - } - ASSERT_OK(db_iter->status()); - ASSERT_EQ(0, key_value_written.size()); - } else if (compression_failure_type_ == kTestDecompressionFail) { - ASSERT_EQ(std::string(s.getState()), - "Could not decompress: kTestDecompressionFail"); - } else if (compression_failure_type_ == kTestDecompressionCorruption) { - ASSERT_EQ(std::string(s.getState()), - "Decompressed block did not match pre-compression block"); - } -} - -TEST_F(DBTest2, CompressionOptions) { - if (!Zlib_Supported() || !Snappy_Supported()) { - return; - } - - Options options = CurrentOptions(); - options.level0_file_num_compaction_trigger = 2; - options.max_bytes_for_level_base = 100; - options.max_bytes_for_level_multiplier = 2; - options.num_levels = 7; - options.max_background_compactions = 1; - - CompactionCompressionListener* listener = - new CompactionCompressionListener(&options); - options.listeners.emplace_back(listener); - - const int kKeySize = 5; - const int kValSize = 20; - Random rnd(301); - - std::vector compression_parallel_threads = {1, 4}; - - std::map key_value_written; - - for (int iter = 0; iter <= 2; iter++) { - listener->max_level_checked = 0; - - if (iter == 0) { - // Use different compression algorithms for different levels but - // always use Zlib for bottommost level - options.compression_per_level = {kNoCompression, kNoCompression, - kNoCompression, kSnappyCompression, - kSnappyCompression, kSnappyCompression, - kZlibCompression}; - options.compression = kNoCompression; - options.bottommost_compression = kZlibCompression; - } else if (iter == 1) { - // Use Snappy except for bottommost level use ZLib - options.compression_per_level = {}; - options.compression = kSnappyCompression; - options.bottommost_compression = kZlibCompression; - } else if (iter == 2) { - // Use Snappy everywhere - options.compression_per_level = {}; - options.compression = kSnappyCompression; - options.bottommost_compression = kDisableCompressionOption; - } - - for (auto num_threads : compression_parallel_threads) { - options.compression_opts.parallel_threads = num_threads; - options.bottommost_compression_opts.parallel_threads = num_threads; - - DestroyAndReopen(options); - // Write 10 random files - for (int i = 0; i < 10; i++) { - for (int j = 0; j < 5; j++) { - std::string key = rnd.RandomString(kKeySize); - std::string value = rnd.RandomString(kValSize); - key_value_written[key] = value; - ASSERT_OK(Put(key, value)); - } - ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - } - - // Make sure that we wrote enough to check all 7 levels - ASSERT_EQ(listener->max_level_checked, 6); - - // Make sure database content is the same as key_value_written - std::unique_ptr db_iter(db_->NewIterator(ReadOptions())); - for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { - std::string key = db_iter->key().ToString(); - std::string value = db_iter->value().ToString(); - ASSERT_NE(key_value_written.find(key), key_value_written.end()); - ASSERT_EQ(key_value_written[key], value); - key_value_written.erase(key); - } - ASSERT_OK(db_iter->status()); - ASSERT_EQ(0, key_value_written.size()); - } - } -} - class CompactionStallTestListener : public EventListener { public: CompactionStallTestListener() @@ -1992,7 +1292,7 @@ TEST_F(DBTest2, DuplicateSnapshot) { Options options; options = CurrentOptions(options); std::vector snapshots; - DBImpl* dbi = static_cast_with_check(db_); + DBImpl* dbi = dbfull(); SequenceNumber oldest_ww_snap, first_ww_snap; ASSERT_OK(Put("k", "v")); // inc seq @@ -3010,7 +2310,7 @@ TEST_F(DBTest2, PausingManualCompaction1) { "TestCompactFiles:PausingManualCompaction:3", [&](void* arg) { auto paused = static_cast*>(arg); // CompactFiles() relies on manual_compactions_paused to - // determine if thie compaction should be paused or not + // determine if this compaction should be paused or not ASSERT_EQ(0, paused->load(std::memory_order_acquire)); paused->fetch_add(1, std::memory_order_release); }); @@ -3122,6 +2422,7 @@ TEST_F(DBTest2, PausingManualCompaction3) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); dbfull()->DisableManualCompaction(); + ASSERT_TRUE(dbfull() ->CompactRange(compact_options, nullptr, nullptr) .IsManualCompactionPaused()); @@ -4393,16 +3694,16 @@ TEST_F(DBTest2, TraceAndReplay) { // Using a different name than db2, to pacify infer's use-after-lifetime // warnings (http://fbinfer.com). - DB* db2_init = nullptr; + std::unique_ptr db2_init; options.create_if_missing = true; ASSERT_OK(DB::Open(options, dbname2, &db2_init)); ColumnFamilyHandle* cf; ASSERT_OK( db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); delete cf; - delete db2_init; + db2_init.reset(); - DB* db2 = nullptr; + std::unique_ptr db2; std::vector column_families; ColumnFamilyOptions cf_options; cf_options.merge_operator = MergeOperators::CreatePutOperator(); @@ -4489,7 +3790,7 @@ TEST_F(DBTest2, TraceAndReplay) { for (auto handle : handles) { delete handle; } - delete db2; + db2.reset(); ASSERT_OK(DestroyDB(dbname2, options)); } @@ -4584,16 +3885,16 @@ TEST_F(DBTest2, TraceAndManualReplay) { // Using a different name than db2, to pacify infer's use-after-lifetime // warnings (http://fbinfer.com). - DB* db2_init = nullptr; + std::unique_ptr db2_init; options.create_if_missing = true; ASSERT_OK(DB::Open(options, dbname2, &db2_init)); ColumnFamilyHandle* cf; ASSERT_OK( db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); delete cf; - delete db2_init; + db2_init.reset(); - DB* db2 = nullptr; + std::unique_ptr db2; std::vector column_families; ColumnFamilyOptions cf_options; cf_options.merge_operator = MergeOperators::CreatePutOperator(); @@ -4829,7 +4130,7 @@ TEST_F(DBTest2, TraceAndManualReplay) { for (auto handle : handles) { delete handle; } - delete db2; + db2.reset(); ASSERT_OK(DestroyDB(dbname2, options)); } @@ -4860,16 +4161,16 @@ TEST_F(DBTest2, TraceWithLimit) { // Using a different name than db2, to pacify infer's use-after-lifetime // warnings (http://fbinfer.com). - DB* db2_init = nullptr; + std::unique_ptr db2_init; options.create_if_missing = true; ASSERT_OK(DB::Open(options, dbname2, &db2_init)); ColumnFamilyHandle* cf; ASSERT_OK( db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); delete cf; - delete db2_init; + db2_init.reset(); - DB* db2 = nullptr; + std::unique_ptr db2; std::vector column_families; ColumnFamilyOptions cf_options; cf_options.merge_operator = MergeOperators::CreatePutOperator(); @@ -4902,7 +4203,7 @@ TEST_F(DBTest2, TraceWithLimit) { for (auto handle : handles) { delete handle; } - delete db2; + db2.reset(); ASSERT_OK(DestroyDB(dbname2, options)); } @@ -4934,16 +4235,16 @@ TEST_F(DBTest2, TraceWithSampling) { // Using a different name than db2, to pacify infer's use-after-lifetime // warnings (http://fbinfer.com). - DB* db2_init = nullptr; + std::unique_ptr db2_init; options.create_if_missing = true; ASSERT_OK(DB::Open(options, dbname2, &db2_init)); ColumnFamilyHandle* cf; ASSERT_OK( db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); delete cf; - delete db2_init; + db2_init.reset(); - DB* db2 = nullptr; + std::unique_ptr db2; std::vector column_families; ColumnFamilyOptions cf_options; column_families.emplace_back("default", cf_options); @@ -4978,7 +4279,7 @@ TEST_F(DBTest2, TraceWithSampling) { for (auto handle : handles) { delete handle; } - delete db2; + db2.reset(); ASSERT_OK(DestroyDB(dbname2, options)); } @@ -5038,16 +4339,16 @@ TEST_F(DBTest2, TraceWithFilter) { // Using a different name than db2, to pacify infer's use-after-lifetime // warnings (http://fbinfer.com). - DB* db2_init = nullptr; + std::unique_ptr db2_init; options.create_if_missing = true; ASSERT_OK(DB::Open(options, dbname2, &db2_init)); ColumnFamilyHandle* cf; ASSERT_OK( db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); delete cf; - delete db2_init; + db2_init.reset(); - DB* db2 = nullptr; + std::unique_ptr db2; std::vector column_families; ColumnFamilyOptions cf_options; cf_options.merge_operator = MergeOperators::CreatePutOperator(); @@ -5083,28 +4384,28 @@ TEST_F(DBTest2, TraceWithFilter) { for (auto handle : handles) { delete handle; } - delete db2; + db2.reset(); ASSERT_OK(DestroyDB(dbname2, options)); // Set up a new db. std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read"); ASSERT_OK(DestroyDB(dbname3, options)); - DB* db3_init = nullptr; + std::unique_ptr db3_init; options.create_if_missing = true; ColumnFamilyHandle* cf3; ASSERT_OK(DB::Open(options, dbname3, &db3_init)); ASSERT_OK( db3_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf3)); delete cf3; - delete db3_init; + db3_init.reset(); column_families.clear(); column_families.emplace_back("default", cf_options); column_families.emplace_back("pikachu", ColumnFamilyOptions()); handles.clear(); - DB* db3 = nullptr; + std::unique_ptr db3; ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3)); env_->SleepForMicroseconds(100); @@ -5134,7 +4435,7 @@ TEST_F(DBTest2, TraceWithFilter) { for (auto handle : handles) { delete handle; } - delete db3; + db3.reset(); ASSERT_OK(DestroyDB(dbname3, options)); std::unique_ptr trace_reader3; @@ -5325,7 +4626,7 @@ TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) { CreateColumnFamilies({"test1", "test2"}, Options()); ASSERT_EQ(handles_.size(), 2); - DBImpl* dbi = static_cast_with_check(db_); + DBImpl* dbi = dbfull(); port::Thread user_thread1([&]() { auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID()); ASSERT_EQ(cfh->GetID(), handles_[0]->GetID()); @@ -5421,6 +4722,103 @@ TEST_F(DBTest2, TestCompactFiles) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } +TEST_F(DBTest2, TestCancelCompactFiles) { + SyncPoint::GetInstance()->EnableProcessing(); + + Options options; + options.env = env_; + options.num_levels = 2; + options.disable_auto_compactions = true; + Reopen(options); + + auto* handle = db_->DefaultColumnFamily(); + ASSERT_EQ(db_->NumberLevels(handle), 2); + + ROCKSDB_NAMESPACE::SstFileWriter sst_file_writer{ + ROCKSDB_NAMESPACE::EnvOptions(), options}; + + // ingest large SST files + std::vector external_sst_file_names; + int key_counter = 0; + const int num_keys_per_file = 100000; + const int num_files = 10; + for (int i = 0; i < num_files; ++i) { + std::string file_name = + dbname_ + "/test_compact_files" + std::to_string(i) + ".sst_t"; + external_sst_file_names.push_back(file_name); + ASSERT_OK(sst_file_writer.Open(file_name)); + for (int j = 0; j < num_keys_per_file; ++j) { + ASSERT_OK(sst_file_writer.Put(Key(j + num_keys_per_file * key_counter), + std::to_string(j))); + } + key_counter += 1; + ASSERT_OK(sst_file_writer.Finish()); + } + + ASSERT_OK(db_->IngestExternalFile(handle, external_sst_file_names, + IngestExternalFileOptions())); + ASSERT_EQ(NumTableFilesAtLevel(1, 0), num_files); + std::vector files; + GetSstFiles(env_, dbname_, &files); + ASSERT_EQ(files.size(), num_files); + + // Test that 0 compactions happen - canceled is set to True initially + CompactionOptions compaction_options; + std::atomic canceled(true); + compaction_options.canceled = &canceled; + + ASSERT_TRUE(db_->CompactFiles(compaction_options, handle, files, 1) + .IsManualCompactionPaused()); + ASSERT_EQ(NumTableFilesAtLevel(1, 0), num_files); + + // Test cancellation before the check to cancel compaction happens - + // compaction should not occur + bool disable_compaction = false; + compaction_options.canceled->store(false, std::memory_order_release); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "TestCancelCompactFiles:SuccessfulCompaction", [&](void* arg) { + auto paused = static_cast*>(arg); + if (disable_compaction) { + db_->DisableManualCompaction(); + ASSERT_EQ(1, paused->load(std::memory_order_acquire)); + } else { + compaction_options.canceled->store(true, std::memory_order_release); + ASSERT_EQ(0, paused->load(std::memory_order_acquire)); + } + }); + + ASSERT_TRUE(db_->CompactFiles(compaction_options, handle, files, 1) + .IsManualCompactionPaused()); + ASSERT_EQ(NumTableFilesAtLevel(1, 0), num_files); + + // DisableManualCompaction() should successfully cancel compaction + disable_compaction = true; + compaction_options.canceled->store(false, std::memory_order_release); + ASSERT_TRUE(db_->CompactFiles(compaction_options, handle, files, 1) + .IsManualCompactionPaused()); + ASSERT_EQ(NumTableFilesAtLevel(1, 0), num_files); + // unlike CompactRange, value of compaction_options.canceled will be + // unaffected by calling DisableManualCompactions() + ASSERT_FALSE(compaction_options.canceled->load()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + db_->EnableManualCompaction(); + + // Test cancelation after the check to cancel compaction - compaction should + // occur, leaving only 1 file + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactFilesImpl:0", [&](void* /*arg*/) { + compaction_options.canceled->store(true, std::memory_order_release); + }); + + compaction_options.canceled->store(false, std::memory_order_release); + ASSERT_OK(db_->CompactFiles(compaction_options, handle, files, 1)); + ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + TEST_F(DBTest2, MultiDBParallelOpenTest) { const int kNumDbs = 2; Options options = CurrentOptions(); @@ -5432,7 +4830,7 @@ TEST_F(DBTest2, MultiDBParallelOpenTest) { // Verify empty DBs can be created in parallel std::vector open_threads; - std::vector dbs{static_cast(kNumDbs), nullptr}; + std::vector> dbs(kNumDbs); options.create_if_missing = true; for (int i = 0; i < kNumDbs; ++i) { open_threads.emplace_back( @@ -5447,7 +4845,7 @@ TEST_F(DBTest2, MultiDBParallelOpenTest) { for (int i = 0; i < kNumDbs; ++i) { open_threads[i].join(); ASSERT_OK(dbs[i]->Put(WriteOptions(), "xi", "gua")); - delete dbs[i]; + dbs[i].reset(); } // Verify non-empty DBs can be recovered in parallel @@ -5463,7 +4861,7 @@ TEST_F(DBTest2, MultiDBParallelOpenTest) { // Wait and cleanup for (int i = 0; i < kNumDbs; ++i) { open_threads[i].join(); - delete dbs[i]; + dbs[i].reset(); ASSERT_OK(DestroyDB(dbnames[i], options)); } } @@ -5524,8 +4922,7 @@ TEST_F(DBTest2, CloseWithUnreleasedSnapshot) { ASSERT_NOK(db_->Close()); db_->ReleaseSnapshot(ss); ASSERT_OK(db_->Close()); - delete db_; - db_ = nullptr; + db_.reset(); } TEST_F(DBTest2, PrefixBloomReseek) { @@ -5807,6 +5204,7 @@ TEST_F(DBTest2, SwitchMemtableRaceWithNewManifest) { Options options = CurrentOptions(); DestroyAndReopen(options); options.max_manifest_file_size = 10; + options.max_manifest_space_amp_pct = 0; options.create_if_missing = true; CreateAndReopenWithCF({"pikachu"}, options); ASSERT_EQ(2, handles_.size()); @@ -6498,6 +5896,7 @@ TEST_P(RenameCurrentTest, Flush) { Destroy(last_options_); Options options = GetDefaultOptions(); options.max_manifest_file_size = 1; + options.max_manifest_space_amp_pct = 0; options.create_if_missing = true; Reopen(options); ASSERT_OK(Put("key", "value")); @@ -6517,6 +5916,7 @@ TEST_P(RenameCurrentTest, Compaction) { Destroy(last_options_); Options options = GetDefaultOptions(); options.max_manifest_file_size = 1; + options.max_manifest_space_amp_pct = 0; options.create_if_missing = true; Reopen(options); ASSERT_OK(Put("a", "a_value")); @@ -6665,15 +6065,9 @@ TEST_F(DBTest2, VariousFileTemperatures) { }; // We don't have enough non-unknown temps to confidently distinguish that - // a specific setting caused a specific outcome, in a single run. This is a - // reasonable work-around without blowing up test time. Only returns - // non-unknown temperatures. - auto RandomTemp = [] { - static std::vector temps = { - Temperature::kHot, Temperature::kWarm, Temperature::kCold}; - return temps[Random::GetTLSInstance()->Uniform( - static_cast(temps.size()))]; - }; + // a specific setting caused a specific outcome, in a single run. Using + // RandomKnownTemperature() is a reasonable work-around without blowing up + // test time. auto test_fs = std::make_shared(env_->GetFileSystem()); std::unique_ptr env(new CompositeEnvWrapper(env_, test_fs)); @@ -6689,22 +6083,22 @@ TEST_F(DBTest2, VariousFileTemperatures) { options.env = env.get(); test_fs->Reset(); if (use_optimize) { - test_fs->optimize_manifest_temperature = RandomTemp(); + test_fs->optimize_manifest_temperature = RandomKnownTemperature(); test_fs->expected_manifest_temperature = test_fs->optimize_manifest_temperature; - test_fs->optimize_wal_temperature = RandomTemp(); + test_fs->optimize_wal_temperature = RandomKnownTemperature(); test_fs->expected_wal_temperature = test_fs->optimize_wal_temperature; } if (use_temp_options) { - options.metadata_write_temperature = RandomTemp(); + options.metadata_write_temperature = RandomKnownTemperature(); test_fs->expected_manifest_temperature = options.metadata_write_temperature; test_fs->expected_other_metadata_temperature = options.metadata_write_temperature; - options.wal_write_temperature = RandomTemp(); + options.wal_write_temperature = RandomKnownTemperature(); test_fs->expected_wal_temperature = options.wal_write_temperature; - options.last_level_temperature = RandomTemp(); - options.default_write_temperature = RandomTemp(); + options.last_level_temperature = RandomKnownTemperature(); + options.default_write_temperature = RandomKnownTemperature(); } DestroyAndReopen(options); @@ -7149,6 +6543,9 @@ TEST_F(DBTest2, LastLevelStatistics) { DestroyAndReopen(options); + get_iostats_context()->Reset(); + IOStatsContext* iostats = get_iostats_context(); + // generate 1 sst on level 0 ASSERT_OK(Put("foo1", "bar")); ASSERT_OK(Put("bar", "bar")); @@ -7249,9 +6646,87 @@ TEST_F(DBTest2, LastLevelStatistics) { // Control ASSERT_NE(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT)); + + // Control: unknown temperature iostats should be zero since files have + // explicit temperatures (mapped or written) + EXPECT_EQ( + iostats->file_io_stats_by_temperature.unknown_non_last_level_bytes_read, + 0); + EXPECT_EQ( + iostats->file_io_stats_by_temperature.unknown_non_last_level_read_count, + 0); + EXPECT_EQ( + iostats->file_io_stats_by_temperature.unknown_last_level_bytes_read, 0); + EXPECT_EQ( + iostats->file_io_stats_by_temperature.unknown_last_level_read_count, 0); } } +// Test the iostats for files with Temperature::kUnknown that is not mapped +// to another temperature. These stats are used to indicate which non-tiered +// workloads are most promising for tiering (so this test doesn't set +// temperatures). +TEST_F(DBTest2, UnknownLastLevelStatistics) { + Options options = CurrentOptions(); + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.no_block_cache = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + DestroyAndReopen(options); + + get_iostats_context()->Reset(); + IOStatsContext* iostats = get_iostats_context(); + + // Generate 1 sst file on level 0 with kUnknown temperature + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + // Read from the kUnknown file on non-last level + ASSERT_EQ("bar", Get("foo")); + + // Verify unknown_non_last_level stats are populated + EXPECT_GT( + iostats->file_io_stats_by_temperature.unknown_non_last_level_bytes_read, + 0); + EXPECT_GT( + iostats->file_io_stats_by_temperature.unknown_non_last_level_read_count, + 0); + // No reads from last level yet + EXPECT_EQ(iostats->file_io_stats_by_temperature.unknown_last_level_bytes_read, + 0); + EXPECT_EQ(iostats->file_io_stats_by_temperature.unknown_last_level_read_count, + 0); + + // Compact to the last level (level 6) explicitly using MoveFilesToLevel + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + MoveFilesToLevel(6); + + // Reopen DB to ensure table cache is cleared and files are re-opened + // with correct is_last_level flag + Reopen(options); + + // Reset iostats to measure only the following reads + get_iostats_context()->Reset(); + + // Read from the file now on last level (still kUnknown since + // last_level_temperature is not set) + ASSERT_EQ("bar", Get("foo")); + + // Verify unknown_last_level stats are populated + EXPECT_GT(iostats->file_io_stats_by_temperature.unknown_last_level_bytes_read, + 0); + EXPECT_GT(iostats->file_io_stats_by_temperature.unknown_last_level_read_count, + 0); + // No new reads from non-last level + EXPECT_EQ( + iostats->file_io_stats_by_temperature.unknown_non_last_level_bytes_read, + 0); + EXPECT_EQ( + iostats->file_io_stats_by_temperature.unknown_non_last_level_read_count, + 0); +} + TEST_F(DBTest2, CheckpointFileTemperature) { class NoLinkTestFS : public FileTemperatureTestFS { using FileTemperatureTestFS::FileTemperatureTestFS; @@ -7298,7 +6773,7 @@ TEST_F(DBTest2, CheckpointFileTemperature) { test_fs->PopRequestedSstFileTemperatures(); Checkpoint* checkpoint; - ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint)); ASSERT_OK( checkpoint->CreateCheckpoint(dbname_ + kFilePathSeparator + "tempcp")); @@ -8060,7 +7535,7 @@ TEST_F(DBTest2, GetFileChecksumsFromCurrentManifest_CRC32) { opts.level0_file_num_compaction_trigger = 10; // Bootstrap the test database. - DB* db = nullptr; + std::unique_ptr db; std::string dbname = test::PerThreadDBPath("file_chksum"); ASSERT_OK(DB::Open(opts, dbname, &db)); @@ -8068,18 +7543,33 @@ TEST_F(DBTest2, GetFileChecksumsFromCurrentManifest_CRC32) { FlushOptions fopts; fopts.wait = true; Random rnd(test::RandomSeed()); + + // Write 4 files into the default column family. for (int i = 0; i < 4; i++) { ASSERT_OK(db->Put(wopts, Key(i), rnd.RandomString(100))); ASSERT_OK(db->Flush(fopts)); } + // Create a new column family, write 1 file into it and drop it. + ColumnFamilyHandle* cf; + ASSERT_OK( + db->CreateColumnFamily(ColumnFamilyOptions(), "soon_to_be_deleted", &cf)); + ASSERT_OK(db->Put(wopts, cf, "some_key", "some_value")); + ASSERT_OK(db->Flush(fopts, cf)); + + // Drop column family should generate corresponding version edit + // in manifest, which we expect to be correctly interpreted by + // GetFileChecksumsFromCurrentManifest API after db close. + ASSERT_OK(db->DropColumnFamily(cf)); + delete cf; + cf = nullptr; + // Obtain rich files metadata for source of truth. std::vector live_files; db->GetLiveFilesMetaData(&live_files); ASSERT_OK(db->Close()); - delete db; - db = nullptr; + db.reset(); // Process current MANIFEST file and build internal file checksum mappings. std::unique_ptr checksum_list(NewFileChecksumList()); diff --git a/db/db_test_util.cc b/db/db_test_util.cc index 3944e92a0dc0..d62807d265c4 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -11,6 +11,7 @@ #include "cache/cache_reservation_manager.h" #include "db/forward_iterator.h" +#include "env/fs_readonly.h" #include "env/mock_env.h" #include "port/lang.h" #include "rocksdb/cache.h" @@ -70,9 +71,9 @@ DBTestBase::DBTestBase(const std::string path, bool env_do_fsync) if (getenv("MEM_ENV")) { mem_env_ = MockEnv::Create(base_env, base_env->GetSystemClock()); } - if (getenv("ENCRYPTED_ENV")) { + if (auto ee = getenv("ENCRYPTED_ENV")) { std::shared_ptr provider; - std::string provider_id = getenv("ENCRYPTED_ENV"); + std::string provider_id = ee; if (provider_id.find('=') == std::string::npos && !EndsWith(provider_id, "://test")) { provider_id = provider_id + "://test"; @@ -96,7 +97,7 @@ DBTestBase::DBTestBase(const std::string path, bool env_do_fsync) EXPECT_OK(DestroyDB(dbname_, delete_options)); // Destroy it for not alternative WAL dir is used. EXPECT_OK(DestroyDB(dbname_, options)); - db_ = nullptr; + db_.reset(); Reopen(options); Random::GetTLSInstance()->Reset(0xdeadbeef); } @@ -365,11 +366,6 @@ Options DBTestBase::GetOptions( table_options.block_cache = NewLRUCache(/* too small */ 1); } - // Test anticipated new default as much as reasonably possible (and remove - // this code when obsolete) - assert(!table_options.decouple_partitioned_filters); - table_options.decouple_partitioned_filters = true; - bool can_allow_mmap = IsMemoryMappedAccessSupported(); switch (option_config) { case kHashSkipList: @@ -458,7 +454,8 @@ Options DBTestBase::GetOptions( options.allow_mmap_reads = can_allow_mmap; break; case kManifestFileSize: - options.max_manifest_file_size = 50; // 50 bytes + options.max_manifest_file_size = 50; // 50 bytes + options.max_manifest_space_amp_pct = 0; // old behavior break; case kPerfOptions: options.delayed_write_rate = 8 * 1024 * 1024; @@ -523,7 +520,7 @@ Options DBTestBase::GetOptions( } case kBlockBasedTableWithLatestFormat: { // In case different from default - table_options.format_version = kLatestFormatVersion; + table_options.format_version = kLatestBbtFormatVersion; break; } case kOptimizeFiltersForHits: { @@ -591,7 +588,6 @@ Options DBTestBase::GetOptions( options_override.level_compaction_dynamic_level_bytes; options.env = env_; options.create_if_missing = true; - options.fail_if_options_file_error = true; return options; } @@ -668,7 +664,8 @@ Status DBTestBase::TryReopenWithColumnFamilies( DBOptions db_opts = DBOptions(options[0]); last_options_ = options[0]; MaybeInstallTimeElapseOnlySleep(db_opts); - return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); + Status s = DB::Open(db_opts, dbname_, column_families, &handles_, &db_); + return s; } Status DBTestBase::TryReopenWithColumnFamilies( @@ -687,8 +684,7 @@ void DBTestBase::Close() { EXPECT_OK(db_->DestroyColumnFamilyHandle(h)); } handles_.clear(); - delete db_; - db_ = nullptr; + db_.reset(); } void DBTestBase::DestroyAndReopen(const Options& options) { @@ -713,7 +709,20 @@ void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) { Status DBTestBase::ReadOnlyReopen(const Options& options) { Close(); MaybeInstallTimeElapseOnlySleep(options); - return DB::OpenForReadOnly(options, dbname_, &db_); + Status s = DB::OpenForReadOnly(options, dbname_, &db_); + return s; +} + +Status DBTestBase::EnforcedReadOnlyReopen(const Options& options) { + Close(); + Options options_copy = options; + MaybeInstallTimeElapseOnlySleep(options_copy); + auto fs_read_only = + std::make_shared(env_->GetFileSystem()); + env_read_only_ = std::make_shared(env_, fs_read_only); + options_copy.env = env_read_only_.get(); + Status s = DB::OpenForReadOnly(options_copy, dbname_, &db_); + return s; } Status DBTestBase::TryReopen(const Options& options) { @@ -728,7 +737,8 @@ Status DBTestBase::TryReopen(const Options& options) { // clears the block cache. last_options_ = options; MaybeInstallTimeElapseOnlySleep(options); - return DB::Open(options, dbname_, &db_); + Status s = DB::Open(options, dbname_, &db_); + return s; } bool DBTestBase::IsDirectIOSupported() { @@ -1148,16 +1158,18 @@ size_t DBTestBase::CountLiveFiles() { } int DBTestBase::NumTableFilesAtLevel(int level, int cf) { - std::string property; - if (cf == 0) { - // default cfd - EXPECT_TRUE(db_->GetProperty( - "rocksdb.num-files-at-level" + std::to_string(level), &property)); - } else { - EXPECT_TRUE(db_->GetProperty( - handles_[cf], "rocksdb.num-files-at-level" + std::to_string(level), - &property)); + return NumTableFilesAtLevel(level, + cf ? handles_[cf] : db_->DefaultColumnFamily()); +} + +int DBTestBase::NumTableFilesAtLevel(int level, ColumnFamilyHandle* cfh, + DB* db) { + if (!db) { + db = db_.get(); } + std::string property; + EXPECT_TRUE(db->GetProperty( + cfh, "rocksdb.num-files-at-level" + std::to_string(level), &property)); return atoi(property.c_str()); } @@ -1190,12 +1202,22 @@ int DBTestBase::TotalTableFiles(int cf, int levels) { // Return spread of files per level std::string DBTestBase::FilesPerLevel(int cf) { - int num_levels = - (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[cf]); + if (cf == 0) { + return FilesPerLevel(db_->DefaultColumnFamily()); + } else { + return FilesPerLevel(handles_[cf]); + } +} + +std::string DBTestBase::FilesPerLevel(ColumnFamilyHandle* cfh, DB* db) { + if (!db) { + db = db_.get(); + } + int num_levels = db->NumberLevels(cfh); std::string result; size_t last_non_zero_offset = 0; for (int level = 0; level < num_levels; level++) { - int f = NumTableFilesAtLevel(level, cf); + int f = NumTableFilesAtLevel(level, cfh, db); char buf[100]; snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); result += buf; @@ -1328,12 +1350,14 @@ void DBTestBase::FillLevels(const std::string& smallest, } void DBTestBase::MoveFilesToLevel(int level, int cf) { + MoveFilesToLevel(level, cf ? handles_[cf] : db_->DefaultColumnFamily()); +} + +void DBTestBase::MoveFilesToLevel(int level, ColumnFamilyHandle* column_family, + DB* db) { + DBImpl* db_impl = db ? static_cast(db) : dbfull(); for (int l = 0; l < level; ++l) { - if (cf > 0) { - EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf])); - } else { - EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr)); - } + EXPECT_OK(db_impl->TEST_CompactRange(l, nullptr, nullptr, column_family)); } } @@ -1852,4 +1876,13 @@ template class TargetCacheChargeTrackingCache< CacheEntryRole::kBlockBasedTableReader>; template class TargetCacheChargeTrackingCache; +const std::vector kKnownTemperatures = { + Temperature::kHot, Temperature::kWarm, Temperature::kCool, + Temperature::kCold, Temperature::kIce}; + +Temperature RandomKnownTemperature() { + return kKnownTemperatures[Random::GetTLSInstance()->Uniform( + static_cast(kKnownTemperatures.size()))]; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_test_util.h b/db/db_test_util.h index 1ddb4faef169..44768f1d1c33 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -452,6 +452,10 @@ class SpecialEnv : public EnvWrapper { return s; } + Status GetFileSize(uint64_t* s) override { + return target_->GetFileSize(s); + } + private: std::unique_ptr target_; anon::AtomicCounter* counter_; @@ -478,6 +482,10 @@ class SpecialEnv : public EnvWrapper { return target_->Prefetch(offset, n); } + Status GetFileSize(uint64_t* s) override { + return target_->GetFileSize(s); + } + private: std::unique_ptr target_; std::atomic* fail_cnt_; @@ -1062,8 +1070,9 @@ class DBTestBase : public testing::Test { MockEnv* mem_env_; Env* encrypted_env_; SpecialEnv* env_; + std::shared_ptr env_read_only_; std::shared_ptr env_guard_; - DB* db_; + std::unique_ptr db_; std::vector handles_; int option_config_; @@ -1148,7 +1157,7 @@ class DBTestBase : public testing::Test { const anon::OptionsOverride& options_override = anon::OptionsOverride()) const; - DBImpl* dbfull() { return static_cast_with_check(db_); } + DBImpl* dbfull() { return static_cast_with_check(db_.get()); } void CreateColumnFamilies(const std::vector& cfs, const Options& options); @@ -1178,6 +1187,9 @@ class DBTestBase : public testing::Test { Status ReadOnlyReopen(const Options& options); + // With a filesystem wrapper that fails on attempted write + Status EnforcedReadOnlyReopen(const Options& options); + Status TryReopen(const Options& options); bool IsDirectIOSupported(); @@ -1268,6 +1280,9 @@ class DBTestBase : public testing::Test { int NumTableFilesAtLevel(int level, int cf = 0); + int NumTableFilesAtLevel(int level, ColumnFamilyHandle* column_family, + DB* db = nullptr); + double CompressionRatioAtLevel(int level, int cf = 0); int TotalTableFiles(int cf = 0, int levels = -1); @@ -1277,6 +1292,8 @@ class DBTestBase : public testing::Test { // Return spread of files per level std::string FilesPerLevel(int cf = 0); + std::string FilesPerLevel(ColumnFamilyHandle* cfh, DB* db = nullptr); + size_t CountFiles(); Status CountFiles(size_t* count); @@ -1308,6 +1325,9 @@ class DBTestBase : public testing::Test { void MoveFilesToLevel(int level, int cf = 0); + void MoveFilesToLevel(int level, ColumnFamilyHandle* column_family, + DB* db = nullptr); + void DumpFileCounts(const char* label); std::string DumpSSTableList(); @@ -1418,20 +1438,23 @@ class DBTestBase : public testing::Test { std::replace(tp_string.begin(), tp_string.end(), ';', ' '); std::replace(tp_string.begin(), tp_string.end(), '=', ' '); ResetTableProperties(tp); - sscanf(tp_string.c_str(), - "# data blocks %" SCNu64 " # entries %" SCNu64 - " # deletions %" SCNu64 " # merge operands %" SCNu64 - " # range deletions %" SCNu64 " raw key size %" SCNu64 - " raw average key size %lf " - " raw value size %" SCNu64 - " raw average value size %lf " - " data block size %" SCNu64 " index block size (user-key? %" SCNu64 - ", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64, - &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions, - &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size, - &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, - &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded, - &tp->index_size, &tp->filter_size); + int count = sscanf( + tp_string.c_str(), + "# data blocks %" SCNu64 " # entries %" SCNu64 " # deletions %" SCNu64 + " # merge operands %" SCNu64 " # range deletions %" SCNu64 + " raw key size %" SCNu64 + " raw average key size %lf " + " raw value size %" SCNu64 + " raw average value size %lf " + " data block size %" SCNu64 " data uncompressed size %" SCNu64 + " index block size (user-key? %" SCNu64 ", delta-value? %" SCNu64 + ") %" SCNu64 " filter block size %" SCNu64, + &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions, + &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size, + &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, + &tp->uncompressed_data_size, &tp->index_key_is_user_key, + &tp->index_value_is_delta_encoded, &tp->index_size, &tp->filter_size); + ASSERT_EQ(count, 15); } private: // Prone to error on direct use @@ -1444,4 +1467,8 @@ class DBTestBase : public testing::Test { // unique ids. void VerifySstUniqueIds(const TablePropertiesCollection& props); +// Excludes kUnknown +extern const std::vector kKnownTemperatures; +Temperature RandomKnownTemperature(); + } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index 5a540e4d3321..465f5d0c9632 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -1672,55 +1672,75 @@ TEST_P(DBTestUniversalCompaction, ConcurrentBottomPriLowPriCompactions) { } const int kNumFilesTrigger = 3; Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM); - Options options = CurrentOptions(); - options.compaction_style = kCompactionStyleUniversal; - options.max_background_compactions = 2; - options.num_levels = num_levels_; - options.write_buffer_size = 100 << 10; // 100KB - options.target_file_size_base = 32 << 10; // 32KB - options.level0_file_num_compaction_trigger = kNumFilesTrigger; - // Trigger compaction if size amplification exceeds 110% - options.compaction_options_universal.max_size_amplification_percent = 110; - DestroyAndReopen(options); - - // Need to get a token to enable compaction parallelism up to - // `max_background_compactions` jobs. - auto pressure_token = - dbfull()->TEST_write_controler().GetCompactionPressureToken(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {// wait for the full compaction to be picked before adding files intended - // for the second one. - {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool", - "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"}, - // the full (bottom-pri) compaction waits until a partial (low-pri) - // compaction has started to verify they can run in parallel. - {"DBImpl::BackgroundCompaction:NonTrivial", - "DBImpl::BGWorkBottomCompaction"}}); - SyncPoint::GetInstance()->EnableProcessing(); - Random rnd(301); - for (int i = 0; i < 2; ++i) { - for (int num = 0; num < kNumFilesTrigger; num++) { - int key_idx = 0; - GenerateNewFile(&rnd, &key_idx, true /* no_wait */); - // use no_wait above because that one waits for flush and compaction. We - // don't want to wait for compaction because the full compaction is - // intentionally blocked while more files are flushed. - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + for (bool universal_reduce_file_locking : {true, false}) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.reduce_file_locking = + universal_reduce_file_locking; + options.max_background_compactions = 2; + options.num_levels = num_levels_; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = kNumFilesTrigger; + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + DestroyAndReopen(options); + + // Need to get a token to enable compaction parallelism up to + // `max_background_compactions` jobs. + auto pressure_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + if (universal_reduce_file_locking) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {// Wait for the full compaction to be repicked before adding files + // intended for the second compaction. + {"DBImpl::BackgroundCompaction():AfterPickCompactionBottomPri", + "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"}, + // Wait for the second compaction to run before running the full + // compaction to verify they can run in parallel + {"DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRunBottomPri"}}); + } else { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {// Wait for the full compaction to be forwarded before adding files + // intended for the second compaction. + {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool", + "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"}, + // Wait for the second compaction to run before running the full + // compaction to verify they can run in parallel + {"DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRunBottomPri"}}); } - if (i == 0) { - TEST_SYNC_POINT( - "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"); + + SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int i = 0; i < 2; ++i) { + for (int num = 0; num < kNumFilesTrigger; num++) { + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx, true /* no_wait */); + // use no_wait above because that one waits for flush and compaction. We + // don't want to wait for compaction because the full compaction is + // intentionally blocked while more files are flushed. + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + if (i == 0) { + TEST_SYNC_POINT( + "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"); + } } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // First compaction should output to bottom level. Second should output to + // L0 since older L0 files pending compaction prevent it from being placed + // lower. + ASSERT_EQ(NumSortedRuns(), 2); + ASSERT_GT(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(num_levels_ - 1), 0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - // First compaction should output to bottom level. Second should output to L0 - // since older L0 files pending compaction prevent it from being placed lower. - ASSERT_EQ(NumSortedRuns(), 2); - ASSERT_GT(NumTableFilesAtLevel(0), 0); - ASSERT_GT(NumTableFilesAtLevel(num_levels_ - 1), 0); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM); } @@ -2086,46 +2106,79 @@ TEST_F(DBTestUniversalCompaction2, OverlappingL0) { } TEST_F(DBTestUniversalCompaction2, IngestBehind) { - const int kNumKeys = 3000; - const int kWindowSize = 100; - const int kNumDelsTrigger = 90; - - Options opts = CurrentOptions(); - opts.table_properties_collector_factories.emplace_back( - NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger)); - opts.compaction_style = kCompactionStyleUniversal; - opts.level0_file_num_compaction_trigger = 2; - opts.compression = kNoCompression; - opts.allow_ingest_behind = true; - opts.compaction_options_universal.size_ratio = 10; - opts.compaction_options_universal.min_merge_width = 2; - opts.compaction_options_universal.max_size_amplification_percent = 200; - Reopen(opts); - - // add an L1 file to prevent tombstones from dropping due to obsolescence - // during flush - int i; - for (i = 0; i < 2000; ++i) { - ASSERT_OK(Put(Key(i), "val")); - } - ASSERT_OK(Flush()); - // MoveFilesToLevel(6); - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - - for (i = 1999; i < kNumKeys; ++i) { - if (i >= kNumKeys - kWindowSize && - i < kNumKeys - kWindowSize + kNumDelsTrigger) { - ASSERT_OK(Delete(Key(i))); + for (bool cf_option : {false, true}) { + SCOPED_TRACE("cf_option = " + std::to_string(cf_option)); + const int kNumKeys = 3000; + const int kWindowSize = 100; + const int kNumDelsTrigger = 90; + + Options opts = CurrentOptions(); + opts.table_properties_collector_factories.emplace_back( + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger)); + opts.compaction_style = kCompactionStyleUniversal; + opts.level0_file_num_compaction_trigger = 2; + opts.compression = kNoCompression; + if (cf_option) { + opts.cf_allow_ingest_behind = true; } else { + opts.allow_ingest_behind = true; + } + opts.compaction_options_universal.size_ratio = 10; + opts.compaction_options_universal.min_merge_width = 2; + opts.compaction_options_universal.max_size_amplification_percent = 200; + Reopen(opts); + + // add an L1 file to prevent tombstones from dropping due to obsolescence + // during flush + int i; + for (i = 0; i < 2000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - } - ASSERT_OK(Flush()); + ASSERT_OK(Flush()); + // MoveFilesToLevel(6); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + for (i = 1999; i < kNumKeys; ++i) { + if (i >= kNumKeys - kWindowSize && + i < kNumKeys - kWindowSize + kNumDelsTrigger) { + ASSERT_OK(Delete(Key(i))); + } else { + ASSERT_OK(Put(Key(i), "val")); + } + } + ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ASSERT_EQ(0, NumTableFilesAtLevel(0)); - ASSERT_EQ(0, NumTableFilesAtLevel(6)); - ASSERT_GT(NumTableFilesAtLevel(5), 0); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(6)); + ASSERT_GT(NumTableFilesAtLevel(5), 0); + + if (cf_option) { + // Test that another CF does not allow ingest behind + ColumnFamilyHandle* new_cfh; + Options new_cf_option; + new_cf_option.compaction_style = kCompactionStyleUniversal; + new_cf_option.num_levels = 7; + // CreateColumnFamilies({"new_cf"}, new_cf_option); + ASSERT_OK(db_->CreateColumnFamily(new_cf_option, "new_cf", &new_cfh)); + // handles_.push_back(new_cfh); + for (i = 0; i < 10; ++i) { + // ASSERT_OK(Put(1, Key(i), "val")); + ASSERT_OK(db_->Put(WriteOptions(), new_cfh, Key(i), "val")); + } + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), new_cfh, nullptr, nullptr)); + // This CF can use the last leve. + std::string property; + EXPECT_TRUE(db_->GetProperty( + new_cfh, "rocksdb.num-files-at-level" + std::to_string(6), + &property)); + ASSERT_EQ(1, atoi(property.c_str())); + + ASSERT_OK(db_->DropColumnFamily(new_cfh)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(new_cfh)); + } + } } TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) { diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index f89cfe59463b..1e9270db0dee 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -395,13 +395,13 @@ TEST_P(DBWALTestWithTimestamp, RecoverAndNoFlush) { read_opts.timestamp = &ts_slice; ASSERT_OK(CreateAndReopenWithTs({"pikachu"}, ts_options, persist_udt, avoid_flush_during_recovery)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), 0U); ASSERT_OK(Put(1, "foo", ts1, "v1")); ASSERT_OK(Put(1, "baz", ts1, "v5")); ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options, persist_udt, avoid_flush_during_recovery)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), 0U); // Do a timestamped read with ts1 after second reopen. CheckGet(read_opts, 1, "foo", "v1", ts1); CheckGet(read_opts, 1, "baz", "v5", ts1); @@ -415,7 +415,7 @@ TEST_P(DBWALTestWithTimestamp, RecoverAndNoFlush) { ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options, persist_udt, avoid_flush_during_recovery)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), 0U); std::string ts3; PutFixed64(&ts3, 3); ASSERT_OK(Put(1, "foo", ts3, "v4")); @@ -466,14 +466,14 @@ TEST_P(DBWALTestWithTimestamp, RecoverAndFlush) { ASSERT_OK(CreateAndReopenWithTs({"pikachu"}, ts_options, persist_udt)); // No flush, no sst files, because of no data. - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), 0U); ASSERT_OK(Put(1, largest_ukey_without_ts, write_ts, "v1")); ASSERT_OK(Put(1, smallest_ukey_without_ts, write_ts, "v5")); ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options, persist_udt)); // Memtable recovered from WAL flushed because `avoid_flush_during_recovery` // defaults to false, created one L0 file. - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 1U); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), 1U); std::vector> level_to_files; dbfull()->TEST_GetFilesMetaData(handles_[1], &level_to_files); @@ -1347,7 +1347,7 @@ TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) { auto tables = ListTableFiles(env_, dbname_); ASSERT_EQ(tables.size(), static_cast(1)); // Make sure 'dobrynia' was flushed: check sst files amount - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"), static_cast(1)); } // New WAL file @@ -1363,16 +1363,16 @@ TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) { options); { // No inserts => default is empty - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"), static_cast(0)); // First 4 keys goes to separate SSTs + 1 more SST for 2 smaller keys - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), static_cast(5)); // 1 SST for big key + 1 SST for small one - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"), static_cast(2)); // 1 SST for all keys - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"), static_cast(1)); } } @@ -1401,7 +1401,7 @@ TEST_F(DBWALTest, RecoverCheckFileAmount) { { auto tables = ListTableFiles(env_, dbname_); ASSERT_EQ(tables.size(), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"), static_cast(1)); } // Memtable for 'nikitich' has flushed, new WAL file has opened @@ -1425,7 +1425,7 @@ TEST_F(DBWALTest, RecoverCheckFileAmount) { { auto tables = ListTableFiles(env_, dbname_); ASSERT_EQ(tables.size(), static_cast(2)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"), static_cast(2)); } @@ -1437,13 +1437,13 @@ TEST_F(DBWALTest, RecoverCheckFileAmount) { // first, second and third WALs went to the same SST. // So, there is 6 SSTs: three for 'nikitich', one for 'default', one for // 'dobrynia', one for 'pikachu' - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"), static_cast(3)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"), static_cast(1)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), static_cast(1)); } } @@ -1521,9 +1521,9 @@ TEST_F(DBWALTest, DISABLED_RecycleMultipleWalsCrash) { // from an old incarnation of the WAL on recovery ASSERT_OK(db_->PauseBackgroundWork()); ASSERT_OK(Put("ignore1", Random::GetTLSInstance()->RandomString(500))); - ASSERT_OK(static_cast_with_check(db_)->TEST_SwitchMemtable()); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); ASSERT_OK(Put("ignore2", Random::GetTLSInstance()->RandomString(500))); - ASSERT_OK(static_cast_with_check(db_)->TEST_SwitchMemtable()); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); ASSERT_OK(db_->ContinueBackgroundWork()); ASSERT_OK(Flush()); ASSERT_OK(Put("ignore3", Random::GetTLSInstance()->RandomString(500))); @@ -1545,13 +1545,13 @@ TEST_F(DBWALTest, DISABLED_RecycleMultipleWalsCrash) { // gap in sequence numbers to interfere with recovery ASSERT_OK(db_->PauseBackgroundWork()); ASSERT_OK(Put("key1", "val1")); - ASSERT_OK(static_cast_with_check(db_)->TEST_SwitchMemtable()); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); ASSERT_OK(Put("key2", "val2")); - ASSERT_OK(static_cast_with_check(db_)->TEST_SwitchMemtable()); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); // Need a gap in sequence numbers, so e.g. ingest external file // with an open snapshot { - ManagedSnapshot snapshot(db_); + ManagedSnapshot snapshot(db_.get()); ASSERT_OK( db_->IngestExternalFile({external_file1}, IngestExternalFileOptions())); } @@ -1560,7 +1560,7 @@ TEST_F(DBWALTest, DISABLED_RecycleMultipleWalsCrash) { // Need an SST file that is logically after that WAL, so that dropping WAL // data is not a valid point in time. { - ManagedSnapshot snapshot(db_); + ManagedSnapshot snapshot(db_.get()); ASSERT_OK( db_->IngestExternalFile({external_file2}, IngestExternalFileOptions())); } @@ -1613,7 +1613,7 @@ TEST_F(DBWALTest, SyncWalPartialFailure) { return s; } - AcqRelAtomic syncs_before_failure_{UINT32_MAX}; + Atomic syncs_before_failure_{UINT32_MAX}; protected: class MyTestWritableFile : public FSWritableFileOwnerWrapper { @@ -1655,10 +1655,10 @@ TEST_F(DBWALTest, SyncWalPartialFailure) { // with a single thread, to exercise as much logic as we reasonably can. ASSERT_OK(db_->PauseBackgroundWork()); ASSERT_OK(Put("key1", "val1")); - ASSERT_OK(static_cast_with_check(db_)->TEST_SwitchMemtable()); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); ASSERT_OK(db_->SyncWAL()); ASSERT_OK(Put("key2", "val2")); - ASSERT_OK(static_cast_with_check(db_)->TEST_SwitchMemtable()); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); ASSERT_OK(Put("key3", "val3")); // Allow 1 of the WALs to sync, but another won't @@ -1746,8 +1746,8 @@ class RecoveryTestHelper { WriteController write_controller; versions.reset(new VersionSet( - test->dbname_, &db_options, file_options, table_cache.get(), - &write_buffer_manager, &write_controller, + test->dbname_, &db_options, MutableDBOptions{options}, file_options, + table_cache.get(), &write_buffer_manager, &write_controller, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", options.daily_offpeak_time_utc, @@ -1879,9 +1879,11 @@ TEST_F(DBWALTest, TrackAndVerifyWALsRecycleWAL) { // Drop `Put("key1", "old_value")` in the first WAL ASSERT_OK(test::TruncateFile(options.env, log_name, 0 /* new_length */)); - Status s = DB::Open(options, dbname_, &db_); + { + Status s = DB::Open(options, dbname_, &db_); - ASSERT_OK(s); + ASSERT_OK(s); + } ASSERT_EQ("wal_to_recycle", Get("key_ignore2")); ASSERT_EQ("NOT_FOUND", Get("key1")); @@ -1979,7 +1981,10 @@ TEST_P(DBWALTrackAndVerifyWALsWithParamsTest, Basic) { ASSERT_OK(options.env->DeleteFile(second_log_name)); } - Status s = DB::Open(options, dbname_, &db_); + Status s; + { + s = DB::Open(options, dbname_, &db_); + } if (i == 0) { ASSERT_OK(s); @@ -2266,17 +2271,17 @@ TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) { SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); - DB* db1 = nullptr; + std::unique_ptr db1; Status s = DB::OpenForReadOnly(options, dbname_, &db1); ASSERT_OK(s); assert(db1); - delete db1; } TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) { Options options = CurrentOptions(); // Small size to force manifest creation options.max_manifest_file_size = 1; + options.max_manifest_space_amp_pct = 0; options.track_and_verify_wals_in_manifest = true; DestroyAndReopen(options); @@ -3024,13 +3029,13 @@ TEST_F(DBWALTest, GetCompressedWalsAfterSync) { options.wal_compression = kZSTD; DestroyAndReopen(options); - // Write something to memtable and WAL so that log_empty_ will be false after + // Write something to memtable and WAL so that wal_empty_ will be false after // next DB::Open(). ASSERT_OK(Put("a", "v")); Reopen(options); - // New WAL is created, thanks to !log_empty_. + // New WAL is created, thanks to !wal_empty_. ASSERT_OK(dbfull()->TEST_SwitchWAL()); ASSERT_OK(Put("b", "v")); diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc index af328707aac7..d4728e9811af 100644 --- a/db/db_with_timestamp_basic_test.cc +++ b/db/db_with_timestamp_basic_test.cc @@ -19,6 +19,13 @@ #include "utilities/merge_operators/string_append/stringappend2.h" namespace ROCKSDB_NAMESPACE { +namespace { +std::string EncodeAsUint64(uint64_t v) { + std::string dst; + PutFixed64(&dst, v); + return dst; +} +} // namespace class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase { public: DBBasicTestWithTimestamp() @@ -655,7 +662,7 @@ TEST_F(DBBasicTestWithTimestamp, TrimHistoryTest) { ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(4, 0), "v2")); ASSERT_OK(db_->Delete(WriteOptions(), "k1", Timestamp(5, 0))); ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(6, 0), "v3")); - check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v3", + check_value_by_ts(db_.get(), "k1", Timestamp(7, 0), Status::OK(), "v3", Timestamp(6, 0)); ASSERT_OK(Flush()); Close(); @@ -668,27 +675,27 @@ TEST_F(DBBasicTestWithTimestamp, TrimHistoryTest) { // Trim data whose version > Timestamp(5, 0), read(k1, ts(7)) <- NOT_FOUND. ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families, &handles_, &db_, Timestamp(5, 0))); - check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::NotFound(), "", + check_value_by_ts(db_.get(), "k1", Timestamp(7, 0), Status::NotFound(), "", Timestamp(5, 0)); Close(); // Trim data whose timestamp > Timestamp(4, 0), read(k1, ts(7)) <- v2 ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families, &handles_, &db_, Timestamp(4, 0))); - check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v2", + check_value_by_ts(db_.get(), "k1", Timestamp(7, 0), Status::OK(), "v2", Timestamp(4, 0)); Close(); Reopen(options); ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "k1", "k3", Timestamp(7, 0))); - check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::NotFound(), "", + check_value_by_ts(db_.get(), "k1", Timestamp(8, 0), Status::NotFound(), "", Timestamp(7, 0)); Close(); // Trim data whose timestamp > Timestamp(6, 0), read(k1, ts(8)) <- v2 ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families, &handles_, &db_, Timestamp(6, 0))); - check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::OK(), "v2", + check_value_by_ts(db_.get(), "k1", Timestamp(8, 0), Status::OK(), "v2", Timestamp(4, 0)); Close(); } @@ -1420,8 +1427,12 @@ TEST_F(DBBasicTestWithTimestamp, ReseekToNextUserKey) { { std::string ts_str = Timestamp(static_cast(kNumKeys + 1), 0); WriteBatch batch(0, 0, 0, kTimestampSize); - { ASSERT_OK(batch.Put("a", "new_value")); } - { ASSERT_OK(batch.Put("b", "new_value")); } + { + ASSERT_OK(batch.Put("a", "new_value")); + } + { + ASSERT_OK(batch.Put("b", "new_value")); + } s = batch.UpdateTimestamps( ts_str, [kTimestampSize](uint32_t) { return kTimestampSize; }); ASSERT_OK(s); @@ -1480,13 +1491,24 @@ TEST_F(DBBasicTestWithTimestamp, ReseekToUserKeyBeforeSavedKey) { Close(); } -TEST_F(DBBasicTestWithTimestamp, - FIXME_ReverseIterationWithBlobAndUnpreparedValue) { +class ReverseIterationWithUnpreparedBlobTest + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface> { + public: + ReverseIterationWithUnpreparedBlobTest() + : DBBasicTestWithTimestampBase( + "db_basic_test_with_timestamp_reverse_with_unprepare") {} +}; +INSTANTIATE_TEST_CASE_P(ReverseIterationWithUnpreparedBlobTest, + ReverseIterationWithUnpreparedBlobTest, + ::testing::Combine(::testing::Values(true, false), + ::testing::Values(0, 2))); +TEST_P(ReverseIterationWithUnpreparedBlobTest, Basic) { Options options = CurrentOptions(); options.create_if_missing = true; options.env = env_; options.enable_blob_files = true; - options.max_sequential_skip_in_iterations = 0; + options.max_sequential_skip_in_iterations = std::get<1>(GetParam()); const size_t kTimestampSize = Timestamp(0, 0).size(); TestComparator test_cmp(kTimestampSize); @@ -1501,7 +1523,7 @@ TEST_F(DBBasicTestWithTimestamp, for (uint64_t key = 0; key <= kMaxKey; ++key) { for (size_t i = 0; i < write_timestamps.size(); ++i) { ASSERT_OK(db_->Put(WriteOptions(), Key1(key), write_timestamps[i], - "value" + std::to_string(i))); + Key1(key) + "value" + std::to_string(i))); } } @@ -1513,17 +1535,28 @@ TEST_F(DBBasicTestWithTimestamp, ReadOptions read_opts; read_opts.timestamp = &read_timestamp; - read_opts.allow_unprepared_value = true; + read_opts.allow_unprepared_value = std::get<0>(GetParam()); std::unique_ptr it(db_->NewIterator(read_opts)); it->SeekForPrev(Key1(kMaxKey)); - ASSERT_TRUE(it->Valid()); - ASSERT_OK(it->status()); + uint64_t key = kMaxKey; + int count = 0; + while (it->Valid()) { + ASSERT_OK(it->status()); - // FIXME: PrepareValue() should succeed and status() should remain OK - ASSERT_FALSE(it->PrepareValue()); - ASSERT_TRUE(it->status().IsCorruption()); + ASSERT_TRUE(it->PrepareValue()); + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + ASSERT_EQ(it->key(), Key1(key)); + ASSERT_EQ(it->timestamp(), Timestamp(3, 0)); + ASSERT_EQ(it->value(), Key1(key) + "value" + std::to_string(1)); + key--; + count++; + it->Prev(); + } + ASSERT_OK(it->status()); + ASSERT_EQ(kMaxKey + 1, count); } Close(); @@ -2371,7 +2404,6 @@ class DataVisibilityTest : public DBBasicTestWithTimestampBase { } } }; -constexpr int DataVisibilityTest::kTestDataSize; // Application specifies timestamp but not snapshot. // reader writer @@ -3746,17 +3778,42 @@ INSTANTIATE_TEST_CASE_P( test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp, test::UserDefinedTimestampTestMode::kNormal)); -TEST_F(DBBasicTestWithTimestamp, EnableDisableUDT) { +// Test params: +// 1) whether to flush before close +class EnableDisableUDTTest : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface { + public: + EnableDisableUDTTest() + : DBBasicTestWithTimestampBase("/enable_disable_udt") {} +}; + +INSTANTIATE_TEST_CASE_P(EnableDisableUDTTest, EnableDisableUDTTest, + ::testing::Values(true, false)); + +TEST_P(EnableDisableUDTTest, Basic) { Options options = CurrentOptions(); + // Un-flushed data before close will involve a WAL replay on DB reopen. + bool flush_before_close = GetParam(); options.env = env_; - // Create a column family without user-defined timestamps. options.comparator = BytewiseComparator(); options.persist_user_defined_timestamps = true; DestroyAndReopen(options); + ReadOptions ropts; + std::string read_ts; + std::string value; + std::string key_ts; + // Create one SST file, its user keys have no user-defined timestamps. - ASSERT_OK(db_->Put(WriteOptions(), "foo", "val1")); - ASSERT_OK(Flush(0)); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "val0")); + ASSERT_OK(db_->Put(WriteOptions(), "bar", "val0")); + ASSERT_OK(db_->DeleteRange(WriteOptions(), "bar", "baz")); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &value)); + ASSERT_EQ("val0", value); + ASSERT_TRUE(db_->Get(ReadOptions(), "bar", &value).IsNotFound()); + if (flush_before_close) { + ASSERT_OK(Flush(0)); + } Close(); // Reopen the existing column family and enable user-defined timestamps @@ -3765,47 +3822,63 @@ TEST_F(DBBasicTestWithTimestamp, EnableDisableUDT) { options.persist_user_defined_timestamps = false; options.allow_concurrent_memtable_write = false; Reopen(options); - - std::string value; - ASSERT_TRUE(db_->Get(ReadOptions(), "foo", &value).IsInvalidArgument()); - std::string read_ts; - PutFixed64(&read_ts, 0); - ReadOptions ropts; + // Read data from previous session before and after compaction. + read_ts = EncodeAsUint64(1); Slice read_ts_slice = read_ts; ropts.timestamp = &read_ts_slice; - std::string key_ts; - // Entries in pre-existing SST files are treated as if they have minimum - // user-defined timestamps. - ASSERT_OK(db_->Get(ropts, "foo", &value, &key_ts)); - ASSERT_EQ("val1", value); - ASSERT_EQ(read_ts, key_ts); + for (int i = 0; i < 2; i++) { + ASSERT_TRUE(db_->Get(ReadOptions(), "foo", &value).IsInvalidArgument()); + // Entries in pre-existing SST files are treated as if they have minimum + // user-defined timestamps. + ASSERT_OK(db_->Get(ropts, "foo", &value, &key_ts)); + ASSERT_EQ("val0", value); + ASSERT_EQ(EncodeAsUint64(0), key_ts); + ASSERT_TRUE(db_->Get(ropts, "bar", &value, &key_ts).IsNotFound()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } // Do timestamped read / write. - std::string write_ts; - PutFixed64(&write_ts, 1); - ASSERT_OK(db_->Put(WriteOptions(), "foo", write_ts, "val2")); - read_ts.clear(); - PutFixed64(&read_ts, 1); + ASSERT_OK(db_->Put(WriteOptions(), "foo", EncodeAsUint64(1), "val1")); + ASSERT_OK(db_->Put(WriteOptions(), "bar", EncodeAsUint64(1), "val1")); + ASSERT_OK(db_->DeleteRange(WriteOptions(), "bar", "baz", EncodeAsUint64(2))); ASSERT_OK(db_->Get(ropts, "foo", &value, &key_ts)); - ASSERT_EQ("val2", value); - ASSERT_EQ(write_ts, key_ts); + ASSERT_EQ("val1", value); + ASSERT_EQ(EncodeAsUint64(1), key_ts); + ASSERT_OK(db_->Get(ropts, "bar", &value, &key_ts)); + ASSERT_EQ("val1", value); + ASSERT_EQ(EncodeAsUint64(1), key_ts); + read_ts = EncodeAsUint64(2); + ASSERT_TRUE(db_->Get(ropts, "bar", &value, &key_ts).IsNotFound()); // The user keys in this SST file don't have user-defined timestamps either, // because `persist_user_defined_timestamps` flag is set to false. - ASSERT_OK(Flush(0)); + if (flush_before_close) { + ASSERT_OK(Flush(0)); + } Close(); // Reopen the existing column family while disabling user-defined timestamps. options.comparator = BytewiseComparator(); Reopen(options); - ASSERT_TRUE(db_->Get(ropts, "foo", &value).IsInvalidArgument()); - ASSERT_OK(db_->Get(ReadOptions(), "foo", &value)); - ASSERT_EQ("val2", value); + // Read data from previous session before and after compaction. + for (int i = 0; i < 2; i++) { + ASSERT_TRUE(db_->Get(ropts, "foo", &value).IsInvalidArgument()); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &value)); + ASSERT_EQ("val1", value); + ASSERT_TRUE(db_->Get(ReadOptions(), "bar", &value).IsNotFound()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } // Continue to write / read the column family without user-defined timestamps. - ASSERT_OK(db_->Put(WriteOptions(), "foo", "val3")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "val2")); + ASSERT_OK(db_->Put(WriteOptions(), "bar", "val2")); + ASSERT_OK(db_->DeleteRange(WriteOptions(), "bar", "baz")); ASSERT_OK(db_->Get(ReadOptions(), "foo", &value)); - ASSERT_EQ("val3", value); + ASSERT_EQ("val2", value); + ASSERT_TRUE(db_->Get(ReadOptions(), "bar", &value).IsNotFound()); + if (flush_before_close) { + ASSERT_OK(Flush(0)); + } Close(); } @@ -4844,6 +4917,117 @@ TEST_F(DBBasicTestWithTimestamp, TimestampFilterTableReadOnGet) { Close(); } +class GetNewestUserDefinedTimestampTest : public DBBasicTestWithTimestampBase { + public: + explicit GetNewestUserDefinedTimestampTest() + : DBBasicTestWithTimestampBase("get_newest_udt_test") {} +}; + +TEST_F(GetNewestUserDefinedTimestampTest, Basic) { + std::string newest_timestamp; + // UDT disabled, get InvalidArgument. + ASSERT_TRUE(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp) + .IsInvalidArgument()); + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.max_write_buffer_number = 5; + options.min_write_buffer_number_to_merge = 4; + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + + DestroyAndReopen(options); + // UDT persisted, get NotSupported. + ASSERT_TRUE(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp) + .IsNotSupported()); + + options.persist_user_defined_timestamps = false; + options.allow_concurrent_memtable_write = false; + + DestroyAndReopen(options); + ASSERT_TRUE( + db_->GetNewestUserDefinedTimestamp(nullptr, nullptr).IsInvalidArgument()); + + ColumnFamilyHandleImpl* cfh = static_cast_with_check( + db_->DefaultColumnFamily()); + ColumnFamilyData* cfd = cfh->cfd(); + // The column family hasn't seen any user defined timestamp + ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp)); + ASSERT_TRUE(newest_timestamp.empty()); + + ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(1), "val1")); + // Testing get newest timestamp from mutable memtable. + ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp)); + ASSERT_EQ(EncodeAsUint64(1), newest_timestamp); + + ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(2), "val2")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable(cfd)); + // Testing get the newest timestamp from immutable memtable because the + // mutable one is empty. + ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp)); + ASSERT_EQ(EncodeAsUint64(2), newest_timestamp); + + ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(3), "val3")); + ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(4), "val4")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable(cfd)); + // Testing get the newest timestamp from the more recent immutable memtable + // when there are multiple immutable memtables. + ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp)); + ASSERT_EQ(EncodeAsUint64(4), newest_timestamp); + + ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(5), "val5")); + // Testing get newest timestamp from mutable memtable when it has data, in the + // presence of immutable memtables. + ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp)); + ASSERT_EQ(EncodeAsUint64(5), newest_timestamp); + + ASSERT_OK(Flush()); + // After flushing and all the user defined timestamp are flushed. User defined + // timestamp info for SST files is available from MANIFEST. + ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp)); + ASSERT_EQ(EncodeAsUint64(5), newest_timestamp); + + Reopen(options); + // Similar after flush, when there is no memtables, but some SST files, + // if MANIFEST records the upperbound of flushed timestamps because timestamps + // are not persisted in SST files, this info can be found. + ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp)); + ASSERT_EQ(EncodeAsUint64(5), newest_timestamp); + + Close(); +} + +TEST_F(GetNewestUserDefinedTimestampTest, ConcurrentWrites) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + options.persist_user_defined_timestamps = false; + options.allow_concurrent_memtable_write = false; + + DestroyAndReopen(options); + + std::vector threads; + threads.reserve(10); + std::atomic current_ts{0}; + for (int i = 0; i < 10; i++) { + threads.emplace_back([this, i, ¤t_ts]() { + if (i % 2 == 0) { + std::string newest_timestamp; + ASSERT_OK( + db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp)); + } else { + uint64_t write_ts = current_ts.fetch_add(1); + ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(write_ts), + "val" + std::to_string(i))); + } + }); + } + + for (auto& t : threads) { + t.join(); + } + Close(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_with_timestamp_compaction_test.cc b/db/db_with_timestamp_compaction_test.cc index 783140cbf7d9..1e35d43f829c 100644 --- a/db/db_with_timestamp_compaction_test.cc +++ b/db/db_with_timestamp_compaction_test.cc @@ -7,9 +7,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include + +#include "db/column_family.h" #include "db/compaction/compaction.h" #include "db/db_test_util.h" #include "port/stack_trace.h" +#include "rocksdb/sst_file_reader.h" #include "test_util/testutil.h" namespace ROCKSDB_NAMESPACE { @@ -48,6 +52,122 @@ class TimestampCompatibleCompactionTest : public DBTestBase { } return value; } + + // Helper to get all files with their level and timestamps + std::vector> + GetAllFileTimestamps() { + std::vector> results; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + auto* cfd = static_cast_with_check(cfh)->cfd(); + auto* vstorage = cfd->current()->storage_info(); + + for (int level = 0; level < cfd->NumberLevels(); level++) { + for (auto* file : vstorage->LevelFiles(level)) { + results.emplace_back(level, file->min_timestamp, file->max_timestamp); + } + } + return results; + } + + // Helper to compute overall min/max timestamps across all files + // Returns {min_ts, max_ts} as uint64_t values + // Asserts that all files have non-empty timestamps + std::pair GetOverallTimestampRange() { + auto files = GetAllFileTimestamps(); + EXPECT_GE(files.size(), 1U); + + uint64_t overall_min = UINT64_MAX; + uint64_t overall_max = 0; + for (const auto& [level, min_ts, max_ts] : files) { + EXPECT_FALSE(min_ts.empty()) << "min_timestamp empty at level " << level; + EXPECT_FALSE(max_ts.empty()) << "max_timestamp empty at level " << level; + + if (!min_ts.empty() && !max_ts.empty()) { + uint64_t file_min = DecodeFixed64(min_ts.data()); + uint64_t file_max = DecodeFixed64(max_ts.data()); + overall_min = std::min(overall_min, file_min); + overall_max = std::max(overall_max, file_max); + } + } + return {overall_min, overall_max}; + } + + // Helper to verify timestamp range matches expected values, including after + // reopen + void VerifyTimestampRangeWithPersistence(const Options& options, + uint64_t expected_min, + uint64_t expected_max) { + // Verify before reopen + auto [min_ts, max_ts] = GetOverallTimestampRange(); + ASSERT_EQ(expected_min, min_ts); + ASSERT_EQ(expected_max, max_ts); + + size_t file_count_before = GetAllFileTimestamps().size(); + + // Verify manifest persistence by reopening + Reopen(options); + + // Verify after reopen + auto [reopened_min_ts, reopened_max_ts] = GetOverallTimestampRange(); + ASSERT_EQ(expected_min, reopened_min_ts); + ASSERT_EQ(expected_max, reopened_max_ts); + ASSERT_EQ(file_count_before, GetAllFileTimestamps().size()); + } + + // Helper to create common options for UDT tests with level compaction + Options CreateTimestampOptions(bool disable_auto_compactions = false) { + Options options = CurrentOptions(); + options.env = env_; + options.compaction_style = kCompactionStyleLevel; + options.num_levels = 4; + options.persist_user_defined_timestamps = true; + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + options.disable_auto_compactions = disable_auto_compactions; + return options; + } + + // Helper to write test data with alternating timestamps in a range + // Writes keys [start_key, end_key) with timestamps alternating between + // min_ts and max_ts + void WriteDataWithTimestampRange(int start_key, int end_key, uint64_t min_ts, + uint64_t max_ts) { + std::string ts_buf; + for (int i = start_key; i < end_key; i++) { + ts_buf.clear(); + uint64_t ts = (i % 2 == 0) ? min_ts : max_ts; + PutFixed64(&ts_buf, ts); + ASSERT_OK(db_->Put(WriteOptions(), Key(i), ts_buf, + "value" + std::to_string(i))); + } + } + + // Helper to check if any file has the expected timestamp range + bool HasFileWithTimestampRange(uint64_t expected_min, uint64_t expected_max) { + auto file_timestamps = GetAllFileTimestamps(); + for (const auto& [level, min_ts, max_ts] : file_timestamps) { + if (!min_ts.empty() && !max_ts.empty()) { + uint64_t file_min = DecodeFixed64(min_ts.data()); + uint64_t file_max = DecodeFixed64(max_ts.data()); + if (file_min == expected_min && file_max == expected_max) { + return true; + } + } + } + return false; + } + + // Helper to verify data is readable with a given timestamp + void VerifyDataReadable(int key, const std::string& expected_value, + uint64_t read_ts) { + std::string value; + std::string ts_buf; + PutFixed64(&ts_buf, read_ts); + ReadOptions read_opts; + Slice ts_slice(ts_buf); + read_opts.timestamp = &ts_slice; + ASSERT_OK(db_->Get(read_opts, Key(key), &value)); + ASSERT_EQ(expected_value, value); + } }; TEST_F(TimestampCompatibleCompactionTest, UserKeyCrossFileBoundary) { @@ -344,6 +464,385 @@ TEST_F(TimestampCompatibleCompactionTest, EmptyCompactionOutput) { ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); } +TEST_F(TimestampCompatibleCompactionTest, SeqnoZeroingWithUDT) { + // This test validates that seqno is only zeroed when the timestamp is older + // than full_history_ts_low_. Before the fix, seqno was incorrectly zeroed + // even when UDT was enabled but timestamp wasn't old enough. + + Options options = CurrentOptions(); + options.env = env_; + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + // Track seqno zeroing events and which keys are zeroed + std::set zeroed_keys; + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput:ZeroingSeq", [&](void* arg) { + auto* ikey = static_cast(arg); + ASSERT_EQ(0, ikey->sequence); + // Extract user key without timestamp (last 8 bytes) + Slice user_key_with_ts = ikey->user_key; + std::string user_key = + user_key_with_ts.ToString().substr(0, user_key_with_ts.size() - 8); + zeroed_keys.insert(user_key); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Case 1: Test that seqno is NOT zeroed when full_history_ts_low is not set + // Write a key with timestamp 100 + std::string ts_str = Timestamp(100); + ASSERT_OK(db_->Put(WriteOptions(), "key1", ts_str, "value1")); + ASSERT_OK(Flush()); + + zeroed_keys.clear(); + { + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + } + // With UDT enabled and no full_history_ts_low, seqno should NOT be zeroed + ASSERT_TRUE(zeroed_keys.empty()); + + // Case 2: Test that seqno IS zeroed when timestamp < full_history_ts_low + // Write a new key with timestamp 200 + ts_str = Timestamp(200); + ASSERT_OK(db_->Put(WriteOptions(), "key2", ts_str, "value2")); + ASSERT_OK(Flush()); + + zeroed_keys.clear(); + { + // Set full_history_ts_low to 300, so ts < 300 should be zeroed + std::string full_history_ts_low = Timestamp(300); + Slice ts_slice = full_history_ts_low; + CompactRangeOptions cro; + cro.full_history_ts_low = &ts_slice; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + } + // key1 (ts=100) and key2 (ts=200) both have ts < 300, so both should be + // zeroed + ASSERT_EQ(2u, zeroed_keys.size()); + ASSERT_TRUE(zeroed_keys.count("key1") > 0); + ASSERT_TRUE(zeroed_keys.count("key2") > 0); + + // Case 3: Write a new key with timestamp >= full_history_ts_low + // and verify it is NOT zeroed while old keys are re-zeroed + ts_str = Timestamp(500); + ASSERT_OK(db_->Put(WriteOptions(), "key3", ts_str, "value3")); + ASSERT_OK(Flush()); + + zeroed_keys.clear(); + { + // Set full_history_ts_low to 400 + // key1 (ts=100) and key2 (ts=200) have ts < 400, will be re-processed + // key3 (ts=500) has ts >= 400, should NOT be zeroed + std::string full_history_ts_low = Timestamp(400); + Slice ts_slice = full_history_ts_low; + CompactRangeOptions cro; + cro.full_history_ts_low = &ts_slice; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + } + // key3 should NOT appear in zeroed_keys since ts=500 >= 400 + ASSERT_TRUE(zeroed_keys.count("key3") == 0); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Verify data is still readable + std::string value; + ts_str = Timestamp(600); + Slice read_ts = ts_str; + ReadOptions read_opts; + read_opts.timestamp = &read_ts; + ASSERT_OK(db_->Get(read_opts, "key1", &value)); + ASSERT_EQ("value1", value); + ASSERT_OK(db_->Get(read_opts, "key2", &value)); + ASSERT_EQ("value2", value); + ASSERT_OK(db_->Get(read_opts, "key3", &value)); + ASSERT_EQ("value3", value); +} + +// Test that files with max_timestamp >= full_history_ts_low are not marked +// for bottommost compaction, which prevents infinite compaction loops. +TEST_F(TimestampCompatibleCompactionTest, + BottommostCompactionRespectsFullHistoryTsLow) { + Options options = CreateTimestampOptions(); + options.level0_file_num_compaction_trigger = 4; + + DestroyAndReopen(options); + + // Write some data with timestamps 100-199 + std::string ts_buf; + for (int i = 0; i < 100; i++) { + ts_buf.clear(); + PutFixed64(&ts_buf, 100 + i); + ASSERT_OK( + db_->Put(WriteOptions(), Key(i), ts_buf, "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + + // Compact to the bottommost level + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + // Set full_history_ts_low to 150 - files with max_ts >= 150 should NOT be + // marked for bottommost compaction since seqno cannot be zeroed + ts_buf.clear(); + PutFixed64(&ts_buf, 150); + ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_buf)); + + // Release a snapshot to potentially trigger bottommost file marking + // but files should NOT be marked because max_ts (199) >= full_history_ts_low + // (150) + const Snapshot* snap = db_->GetSnapshot(); + db_->ReleaseSnapshot(snap); + + // Wait for any scheduled compactions - should complete without infinite loop + // Use a reasonable timeout to detect infinite loops + WaitForCompactOptions wfc_options; + wfc_options.timeout = std::chrono::microseconds(5000000); // 5 seconds + Status s = dbfull()->WaitForCompact(wfc_options); + // Should succeed without timeout (no infinite compaction loop) + ASSERT_TRUE(s.ok() || s.IsTimedOut()); + if (s.IsTimedOut()) { + // If timeout, the fix is not working - this should not happen + FAIL() << "WaitForCompact timed out - possible infinite compaction loop"; + } + + // Now set full_history_ts_low beyond max timestamp in the file (200+) + // This should allow the file to be properly marked and compacted + ts_buf.clear(); + PutFixed64(&ts_buf, 300); + ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_buf)); + + // Trigger another snapshot release to potentially mark files + snap = db_->GetSnapshot(); + db_->ReleaseSnapshot(snap); + + // Now compaction should clean up the file. + ASSERT_OK(dbfull()->TEST_WaitForCompact()); +} + +// Test that files are NOT marked for bottommost compaction when UDT is enabled +// and full_history_ts_low has never been set (empty). +TEST_F(TimestampCompatibleCompactionTest, + BottommostCompactionSkipsWhenFullHistoryTsLowNotSet) { + Options options = CreateTimestampOptions(); + + DestroyAndReopen(options); + + // Write some data with timestamps 100-199 + std::string ts_buf; + for (int i = 0; i < 100; i++) { + ts_buf.clear(); + PutFixed64(&ts_buf, 100 + i); + ASSERT_OK( + db_->Put(WriteOptions(), Key(i), ts_buf, "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + + // Compact to the bottommost level without setting full_history_ts_low + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + // Verify files have valid max_timestamp + auto file_timestamps = GetAllFileTimestamps(); + ASSERT_GE(file_timestamps.size(), 1U); + for (const auto& [level, min_ts, max_ts] : file_timestamps) { + ASSERT_FALSE(max_ts.empty()) << "max_timestamp should not be empty"; + } + + // full_history_ts_low is NOT set (empty), so files should NOT be marked + // for bottommost compaction even after releasing a snapshot. + // This tests the branch: if (full_history_ts_low.empty()) { continue; } + const Snapshot* snap = db_->GetSnapshot(); + db_->ReleaseSnapshot(snap); + + // Wait for any scheduled compactions + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Now set full_history_ts_low to a value > max_timestamp (199) in the file + // This should allow the file to be properly marked and compacted + ts_buf.clear(); + PutFixed64(&ts_buf, 300); + ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_buf)); + + // Trigger another snapshot release to potentially mark files + snap = db_->GetSnapshot(); + db_->ReleaseSnapshot(snap); + + // Now compaction should be able to proceed since full_history_ts_low is set + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Verify data is still readable + VerifyDataReadable(0, "value0", 250); +} + +// Test that ingested SST files created with UDT have their min/max timestamps +// properly extracted from table properties and populated in FileMetaData. +// This verifies the fix in external_sst_file_ingestion_job.cc that calls +// ExtractTimestampFromTableProperties after creating FileMetaData. +TEST_F(TimestampCompatibleCompactionTest, + IngestedFileTimestampsExtractedFromTableProperties) { + Options options = CreateTimestampOptions(); + + DestroyAndReopen(options); + + // Create an SST file WITH timestamps using SstFileWriter + std::string sst_file = dbname_ + "/ingested_udt_file.sst"; + const uint64_t kMinTs = 100; + const uint64_t kMaxTs = 200; + + { + SstFileWriter sst_file_writer(EnvOptions(), options); + ASSERT_OK(sst_file_writer.Open(sst_file)); + + std::string ts_buf; + for (int i = 0; i < 10; i++) { + // Alternate between min and max timestamps + uint64_t ts = (i % 2 == 0) ? kMinTs : kMaxTs; + ts_buf.clear(); + PutFixed64(&ts_buf, ts); + // SstFileWriter with UDT comparator requires key with timestamp + ASSERT_OK( + sst_file_writer.Put(Key(i), ts_buf, "value" + std::to_string(i))); + } + ASSERT_OK(sst_file_writer.Finish()); + } + + // Verify the SST file has timestamp properties before ingestion + { + std::unique_ptr reader(new SstFileReader(options)); + ASSERT_OK(reader->Open(sst_file)); + auto props = reader->GetTableProperties(); + auto& user_collected = props->user_collected_properties; + ASSERT_TRUE(user_collected.find("rocksdb.timestamp_min") != + user_collected.end()) + << "SST file should have rocksdb.timestamp_min property"; + ASSERT_TRUE(user_collected.find("rocksdb.timestamp_max") != + user_collected.end()) + << "SST file should have rocksdb.timestamp_max property"; + } + + // Ingest the SST file + IngestExternalFileOptions ifo; + ifo.move_files = false; + ASSERT_OK(db_->IngestExternalFile({sst_file}, ifo)); + + // Verify the ingested file has proper timestamps in FileMetaData + ASSERT_TRUE(HasFileWithTimestampRange(kMinTs, kMaxTs)) + << "Ingested file should have min_timestamp=" << kMinTs + << " and max_timestamp=" << kMaxTs << " in FileMetaData"; + + // Verify timestamps persist after reopen + Reopen(options); + + ASSERT_TRUE(HasFileWithTimestampRange(kMinTs, kMaxTs)) + << "Ingested file timestamps should persist after reopen"; + + // Verify data is readable + VerifyDataReadable(0, "value0", kMaxTs); + + // Clean up + ASSERT_OK(env_->DeleteFile(sst_file)); +} + +// Test that min/max timestamps are correctly tracked in FileMetaData and +// persisted in the manifest during flush. +TEST_F(TimestampCompatibleCompactionTest, TimestampRangePersistenceFlush) { + Options options = CreateTimestampOptions(); + + DestroyAndReopen(options); + + // Expected timestamp range + const uint64_t kMinTs = 100; + const uint64_t kMaxTs = 200; + + // Write data with specific timestamp range + WriteDataWithTimestampRange(0, 50, kMinTs, kMaxTs); + ASSERT_OK(Flush()); + + // First verify table properties have the timestamps + // (this confirms TimestampTablePropertiesCollector is working) + TablePropertiesCollection props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); + ASSERT_EQ(1U, props.size()); + for (const auto& item : props) { + auto& user_collected = item.second->user_collected_properties; + ASSERT_TRUE(user_collected.find("rocksdb.timestamp_min") != + user_collected.end()); + ASSERT_TRUE(user_collected.find("rocksdb.timestamp_max") != + user_collected.end()); + // Verify the collected timestamps match expected values + std::string collected_min_ts = user_collected.at("rocksdb.timestamp_min"); + std::string collected_max_ts = user_collected.at("rocksdb.timestamp_max"); + ASSERT_EQ(kMinTs, DecodeFixed64(collected_min_ts.data())); + ASSERT_EQ(kMaxTs, DecodeFixed64(collected_max_ts.data())); + } + + // Verify FileMetaData timestamps and persistence through reopen + VerifyTimestampRangeWithPersistence(options, kMinTs, kMaxTs); + + // Verify we can still read the data + VerifyDataReadable(0, "value0", kMaxTs); +} + +// Test that min/max timestamps are correctly merged during compaction +// and persisted in the manifest. +TEST_F(TimestampCompatibleCompactionTest, TimestampRangePersistenceCompaction) { + Options options = CreateTimestampOptions(true /* disable_auto_compactions */); + + DestroyAndReopen(options); + + // Create multiple L0 files with different timestamp ranges + // File 1: timestamps 100-150 + const uint64_t kFile1MinTs = 100; + const uint64_t kFile1MaxTs = 150; + WriteDataWithTimestampRange(0, 10, kFile1MinTs, kFile1MaxTs); + ASSERT_OK(Flush()); + + // File 2: timestamps 50-80 (earlier range) + const uint64_t kFile2MinTs = 50; + const uint64_t kFile2MaxTs = 80; + WriteDataWithTimestampRange(10, 20, kFile2MinTs, kFile2MaxTs); + ASSERT_OK(Flush()); + + // File 3: timestamps 200-300 (later range) + const uint64_t kFile3MinTs = 200; + const uint64_t kFile3MaxTs = 300; + WriteDataWithTimestampRange(20, 30, kFile3MinTs, kFile3MaxTs); + ASSERT_OK(Flush()); + + // Expected combined range: min=50, max=300 + const uint64_t kExpectedMinTs = 50; + const uint64_t kExpectedMaxTs = 300; + + // Verify we have 3 L0 files before compaction with valid timestamps + auto files_before = GetAllFileTimestamps(); + ASSERT_EQ(3U, files_before.size()); + for (const auto& [level, min_ts, max_ts] : files_before) { + ASSERT_EQ(0, level); // All files should be in L0 + ASSERT_FALSE(min_ts.empty()); + ASSERT_FALSE(max_ts.empty()); + } + + // Trigger compaction + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Verify timestamp range and persistence through reopen + VerifyTimestampRangeWithPersistence(options, kExpectedMinTs, kExpectedMaxTs); + + // Verify data is still readable + VerifyDataReadable(0, "value0", kExpectedMaxTs); + VerifyDataReadable(15, "value15", kExpectedMaxTs); + VerifyDataReadable(25, "value25", kExpectedMaxTs); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_write_buffer_manager_test.cc b/db/db_write_buffer_manager_test.cc index db4bf2b8a289..2eff1d397f7e 100644 --- a/db/db_write_buffer_manager_test.cc +++ b/db/db_write_buffer_manager_test.cc @@ -183,11 +183,11 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) { // is waiting to be finished but DBs tries to write meanwhile. TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { std::vector dbnames; - std::vector dbs; + std::vector> dbs; int num_dbs = 3; for (int i = 0; i < num_dbs; i++) { - dbs.push_back(nullptr); + dbs.emplace_back(); dbnames.push_back( test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); } @@ -266,7 +266,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { // Last writer will write and when its blocked it will signal Flush to // continue to clear the stall. - threads.emplace_back(write_db, db_); + threads.emplace_back(write_db, db_.get()); // Wait untill first DB is blocked and then create the multiple writers for // different DBs which will be blocked from getting added to the queue because // stall is in effect. @@ -277,7 +277,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { } } for (int i = 0; i < num_dbs; i++) { - threads.emplace_back(write_db, dbs[i]); + threads.emplace_back(write_db, dbs[i].get()); } for (auto& t : threads) { t.join(); @@ -289,7 +289,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { for (int i = 0; i < num_dbs; i++) { ASSERT_OK(dbs[i]->Close()); ASSERT_OK(DestroyDB(dbnames[i], options)); - delete dbs[i]; + dbs[i].reset(); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -300,11 +300,11 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { // blocked when stall by WriteBufferManager is in effect. TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) { std::vector dbnames; - std::vector dbs; + std::vector> dbs; int num_dbs = 3; for (int i = 0; i < num_dbs; i++) { - dbs.push_back(nullptr); + dbs.emplace_back(); dbnames.push_back( test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); } @@ -407,7 +407,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) { // | // Last writer thread will write and when its blocked it will signal Flush to // continue to clear the stall. - threads.emplace_back(write_db, db_); + threads.emplace_back(write_db, db_.get()); // Wait untill first thread is blocked and then create the multiple writer // threads. { @@ -421,7 +421,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) { // Write to multiple columns of db_. writer_threads.emplace_back(write_cf, i % 3); // Write to different dbs. - threads.emplace_back(write_db, dbs[i]); + threads.emplace_back(write_db, dbs[i].get()); } for (auto& t : threads) { t.join(); @@ -441,7 +441,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) { for (int i = 0; i < num_dbs; i++) { ASSERT_OK(dbs[i]->Close()); ASSERT_OK(DestroyDB(dbnames[i], options)); - delete dbs[i]; + dbs[i].reset(); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -604,11 +604,11 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) { // dbs by passing different values to WriteOption.no_slown_down. TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { std::vector dbnames; - std::vector dbs; + std::vector> dbs; int num_dbs = 4; for (int i = 0; i < num_dbs; i++) { - dbs.push_back(nullptr); + dbs.emplace_back(); dbnames.push_back( test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); } @@ -732,7 +732,7 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { // | // Last writer thread will write and when its blocked/return it will signal // Flush to continue to clear the stall. - threads.emplace_back(write_slow_down, db_); + threads.emplace_back(write_slow_down, db_.get()); // Wait untill first thread writing to DB is blocked and then // create the multiple writers. { @@ -744,11 +744,11 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { for (int i = 0; i < num_dbs; i += 2) { // Write to multiple columns of db_. - writer_threads.emplace_back(write_slow_down, db_); - writer_threads.emplace_back(write_no_slow_down, db_); + writer_threads.emplace_back(write_slow_down, db_.get()); + writer_threads.emplace_back(write_no_slow_down, db_.get()); // Write to different DBs. - threads.emplace_back(write_slow_down, dbs[i]); - threads.emplace_back(write_no_slow_down, dbs[i + 1]); + threads.emplace_back(write_slow_down, dbs[i].get()); + threads.emplace_back(write_no_slow_down, dbs[i + 1].get()); } for (auto& t : threads) { @@ -773,7 +773,7 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { for (int i = 0; i < num_dbs; i++) { ASSERT_OK(dbs[i]->Close()); ASSERT_OK(DestroyDB(dbnames[i], options)); - delete dbs[i]; + dbs[i].reset(); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -809,7 +809,7 @@ TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) { Reopen(options); std::string dbname = test::PerThreadDBPath("db_shared_wbm_db"); - DB* shared_wbm_db = nullptr; + std::unique_ptr shared_wbm_db; ASSERT_OK(DestroyDB(dbname, options)); ASSERT_OK(DB::Open(options, dbname, &shared_wbm_db)); @@ -842,7 +842,7 @@ TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) { sleeping_task_high.WaitUntilDone(); ASSERT_OK(shared_wbm_db->Close()); ASSERT_OK(DestroyDB(dbname, options)); - delete shared_wbm_db; + shared_wbm_db.reset(); } TEST_F(DBWriteBufferManagerTest, RuntimeChangeableAllowStall) { diff --git a/db/db_write_test.cc b/db/db_write_test.cc index 2dfcd864f5a5..97fb86c14c2c 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -741,7 +741,7 @@ TEST_P(DBWriteTest, LockWALConcurrentRecursive) { ExternalSstFileInfo external_info; ASSERT_OK(sst_file_writer.Finish(&external_info)); } - AcqRelAtomic parallel_ingest_completed{false}; + Atomic parallel_ingest_completed{false}; port::Thread parallel_ingest{[&]() { IngestExternalFileOptions ingest_opts; ingest_opts.move_files = true; // faster than copy @@ -750,7 +750,7 @@ TEST_P(DBWriteTest, LockWALConcurrentRecursive) { parallel_ingest_completed.Store(true); }}; - AcqRelAtomic flush_completed{false}; + Atomic flush_completed{false}; port::Thread parallel_flush{[&]() { FlushOptions flush_opts; // NB: Flush with wait=false case is tested above in LockWALInEffect @@ -762,7 +762,7 @@ TEST_P(DBWriteTest, LockWALConcurrentRecursive) { flush_completed.Store(true); }}; - AcqRelAtomic parallel_put_completed{false}; + Atomic parallel_put_completed{false}; port::Thread parallel_put{[&]() { // This can make certain failure scenarios more likely: // sleep(1); @@ -987,7 +987,7 @@ TEST_P(DBWriteTest, RecycleLogToggleTest) { options.recycle_log_file_num = 1; Reopen(options); - // 1.log is added to alive_log_files_ + // 1.log is added to alive_wal_files_ ASSERT_OK(Put(Key(2), "val1")); ASSERT_OK(Flush()); // 1.log should be deleted and not recycled, since it @@ -1000,6 +1000,80 @@ TEST_P(DBWriteTest, RecycleLogToggleTest) { ASSERT_EQ(Get(Key(1)), "val2"); } +TEST_P(DBWriteTest, IngestWriteBatchWithIndex) { + if (GetParam() == kPipelinedWrite) { + return; + } + + Options options = GetOptions(); + options.disable_auto_compactions = true; + Reopen(options); + Options cf_options = GetOptions(); + cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); + CreateColumnFamilies({"cf1", "cf2"}, cf_options); + ReopenWithColumnFamilies({"default", "cf1", "cf2"}, + {options, cf_options, cf_options}); + + // default cf + auto wbwi1 = std::make_shared(options.comparator, 0, + /*overwrite_key=*/true); + ASSERT_OK(wbwi1->Put("key1", "value1")); + ASSERT_OK(wbwi1->Put("key2", "value2")); + if (GetParam() == kPipelinedWrite) { + ASSERT_TRUE(db_->IngestWriteBatchWithIndex({}, wbwi1).IsNotSupported()); + return; + } + // Test disableWAL=false + ASSERT_TRUE(db_->IngestWriteBatchWithIndex({}, wbwi1).IsNotSupported()); + + WriteOptions wo; + wo.disableWAL = true; + ASSERT_OK(db_->IngestWriteBatchWithIndex(wo, wbwi1)); + ASSERT_EQ("value1", Get("key1")); + ASSERT_EQ("value2", Get("key2")); + + // Test with overwrites + auto wbwi = std::make_shared(options.comparator, 0, + /*overwrite_key=*/true); + ASSERT_OK(wbwi->Put("key2", "value3")); + ASSERT_OK(wbwi->Delete("key1")); // Delete an existing key + ASSERT_OK(db_->IngestWriteBatchWithIndex(wo, wbwi)); + ASSERT_EQ("NOT_FOUND", Get("key1")); + ASSERT_EQ("value3", Get("key2")); + + auto wbwi2 = std::make_shared(options.comparator, 0, + /*overwrite_key=*/true); + ASSERT_OK(wbwi2->Put(handles_[1], "cf1_key1", "cf1_value1")); + ASSERT_OK(wbwi2->Delete(handles_[1], "cf1_key2")); + // Test ingestion with column family + ASSERT_OK(db_->IngestWriteBatchWithIndex(wo, wbwi2)); + ASSERT_EQ("cf1_value1", Get(1, "cf1_key1")); + ASSERT_EQ("NOT_FOUND", Get(1, "cf1_key2")); + + auto wbwi3 = std::make_shared(options.comparator, 0, + /*overwrite_key=*/true); + ASSERT_OK(wbwi3->Merge(handles_[2], "cf2_key1", "cf2_value1")); + ASSERT_OK(wbwi3->Merge(handles_[2], "cf2_key1", "cf2_value2")); + // Test ingestion with merge operations + ASSERT_OK(db_->IngestWriteBatchWithIndex(wo, wbwi3)); + ASSERT_EQ("cf2_value1,cf2_value2", Get(2, "cf2_key1")); + + // Test with overwrite_key = false + auto wbwi_no_overwrite = std::make_shared( + options.comparator, 0, /*overwrite_key=*/false); + ASSERT_OK(wbwi_no_overwrite->Put("key1", "value1")); + Status s = db_->IngestWriteBatchWithIndex(wo, wbwi_no_overwrite); + ASSERT_TRUE(s.IsNotSupported()); + + auto empty_wbwi = std::make_shared( + options.comparator, 0, /*overwrite_key=*/true); + ASSERT_OK(db_->IngestWriteBatchWithIndex(wo, empty_wbwi)); + + DestroyAndReopen(options); + // Should fail when trying to ingest to non-existent column family + ASSERT_NOK(db_->IngestWriteBatchWithIndex(wo, wbwi2)); +} + INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest, testing::Values(DBTestBase::kDefault, DBTestBase::kConcurrentWALWrites, diff --git a/db/dbformat.h b/db/dbformat.h index 3dfb077397ed..e1b9342ff430 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -83,6 +83,8 @@ extern const ValueType kValueTypeForSeekForPrev; // A range of user keys used internally by RocksDB. Also see `Range` used by // public APIs. +// TODO: merge with Range in pubic API, but this is generally inclusive limit +// and it is maybe exclusive limit struct UserKeyRange { // In case of user_defined timestamp, if enabled, `start` and `limit` should // include user_defined timestamps. @@ -93,18 +95,17 @@ struct UserKeyRange { UserKeyRange(const Slice& s, const Slice& l) : start(s), limit(l) {} }; -// A range of user keys used internally by RocksDB. Also see `RangePtr` used by +// A range of user keys used internally by RocksDB. Also see `RangeOpt` used by // public APIs. -struct UserKeyRangePtr { +struct UserKeyRangeOpt { // In case of user_defined timestamp, if enabled, `start` and `limit` should // point to key with timestamp part. // An optional range start, if missing, indicating a start before all keys. - std::optional start; + OptSlice start; // An optional range end, if missing, indicating an end after all keys. - std::optional limit; + OptSlice limit; - UserKeyRangePtr(const std::optional& s, const std::optional& l) - : start(s), limit(l) {} + UserKeyRangeOpt(const OptSlice& s, const OptSlice& l) : start(s), limit(l) {} }; // Checks whether a type is an inline value type @@ -469,6 +470,7 @@ class InternalKey { Slice user_key() const { return ExtractUserKey(rep_); } size_t size() const { return rep_.size(); } + bool unset() const { return rep_.empty(); } void Set(const Slice& _user_key, SequenceNumber s, ValueType t) { SetFrom(ParsedInternalKey(_user_key, s, t)); @@ -978,11 +980,6 @@ class InternalKeySliceTransform : public SliceTransform { return transform_->InDomain(user_key); } - bool InRange(const Slice& dst) const override { - auto user_key = ExtractUserKey(dst); - return transform_->InRange(user_key); - } - const SliceTransform* user_prefix_extractor() const { return transform_; } private: diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index ab31e5a6f087..674e01307f19 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -333,6 +333,50 @@ TEST_F(FormatTest, ReplaceInternalKeyWithMinTimestamp) { ASSERT_EQ(kTypeValue, new_key.type); } +TEST(RocksdbVersionTest, Version) { + // Test preprocessor macros for versioning + ASSERT_GT(ROCKSDB_MAJOR, 0); + ASSERT_GE(ROCKSDB_MINOR, 0); + ASSERT_GE(ROCKSDB_PATCH, 0); + ASSERT_LT(ROCKSDB_MAJOR, 1000); + ASSERT_LT(ROCKSDB_MINOR, 1000); + ASSERT_LT(ROCKSDB_PATCH, 1000); + ASSERT_EQ(ROCKSDB_MAKE_VERSION_INT(123, 456, 789), 123456789); + ASSERT_GT(ROCKSDB_VERSION_INT, 9999999); + ASSERT_LT(ROCKSDB_VERSION_INT, 99999999); + static_assert(ROCKSDB_VERSION_GE(9, 8, 7)); + static_assert( + ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH)); + static_assert( + ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH - 1)); + static_assert( + ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH - 100)); + static_assert( + ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR - 1, ROCKSDB_PATCH + 1)); + static_assert(ROCKSDB_VERSION_GE(ROCKSDB_MAJOR - 1, ROCKSDB_MINOR + 1, + ROCKSDB_PATCH + 1)); + static_assert( + !ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH + 1)); + static_assert( + !ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH + 100)); + static_assert( + !ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR + 1, ROCKSDB_PATCH - 1)); + static_assert(!ROCKSDB_VERSION_GE(ROCKSDB_MAJOR + 1, ROCKSDB_MINOR - 1, + ROCKSDB_PATCH - 1)); + // More typical usage (but with literal numbers based on relevant API + // features) +#if ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH) + static_assert(true); +#else + static_assert(false); +#endif +#if !ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH + 1) + static_assert(true); +#else + static_assert(false); +#endif +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/error_handler.cc b/db/error_handler.cc index 24c555764f30..1e777fd42600 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -275,9 +275,6 @@ void ErrorHandler::HandleKnownErrors(const Status& bg_err, return; } - ROCKS_LOG_INFO(db_options_.info_log, - "ErrorHandler: Set regular background error\n"); - bool paranoid = db_options_.paranoid_checks; Status::Severity sev = Status::Severity::kFatalError; Status new_bg_err; @@ -335,12 +332,21 @@ void ErrorHandler::HandleKnownErrors(const Status& bg_err, if (!s.ok() && (s.severity() > bg_error_.severity())) { bg_error_ = s; } else { + ROCKS_LOG_INFO(db_options_.info_log, + "ErrorHandler: Hit less severe background error\n"); + // This error is less severe than previously encountered error. Don't // take any further action return; } } + bool stop = bg_error_.severity() >= Status::Severity::kHardError; + ROCKS_LOG_INFO( + db_options_.info_log, + "ErrorHandler: Set regular background error, auto_recovery=%d, stop=%d\n", + int{auto_recovery}, int{stop}); + recover_context_ = context; if (auto_recovery) { recovery_in_prog_ = true; @@ -351,7 +357,7 @@ void ErrorHandler::HandleKnownErrors(const Status& bg_err, RecoverFromNoSpace(); } } - if (bg_error_.severity() >= Status::Severity::kHardError) { + if (stop) { is_db_stopped_.store(true, std::memory_order_release); } } diff --git a/db/error_handler_fs_test.cc b/db/error_handler_fs_test.cc index 57c3c0dcdd88..26263011ffde 100644 --- a/db/error_handler_fs_test.cc +++ b/db/error_handler_fs_test.cc @@ -1550,7 +1550,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { std::vector fault_fs; std::vector options; std::vector> listener; - std::vector db; + std::vector> db; std::shared_ptr sfm(NewSstFileManager(def_env)); int kNumDbInstances = 3; Random rnd(301); @@ -1567,7 +1567,6 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { options[i].writable_file_max_buffer_size = 32768; options[i].listeners.emplace_back(listener[i]); options[i].sst_file_manager = sfm; - DB* dbptr; char buf[16]; listener[i]->EnableAutoRecovery(); @@ -1576,8 +1575,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { IOStatus::NoSpace("Out of space")); snprintf(buf, sizeof(buf), "_%d", i); ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); - ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr)); - db.emplace_back(dbptr); + ASSERT_OK( + DB::Open(options[i], dbname_ + std::string(buf), &db.emplace_back())); } for (auto i = 0; i < kNumDbInstances; ++i) { @@ -1609,7 +1608,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { } for (auto i = 0; i < kNumDbInstances; ++i) { - Status s = static_cast(db[i])->TEST_WaitForCompact(); + Status s = static_cast(db[i].get())->TEST_WaitForCompact(); ASSERT_EQ(s.severity(), Status::Severity::kSoftError); fault_fs[i]->SetFilesystemActive(true); } @@ -1618,7 +1617,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { for (auto i = 0; i < kNumDbInstances; ++i) { std::string prop; ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); - ASSERT_OK(static_cast(db[i])->TEST_WaitForCompact()); + ASSERT_OK(static_cast(db[i].get())->TEST_WaitForCompact()); EXPECT_TRUE(db[i]->GetProperty( "rocksdb.num-files-at-level" + std::to_string(0), &prop)); EXPECT_EQ(atoi(prop.c_str()), 0); @@ -1634,7 +1633,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { for (auto i = 0; i < kNumDbInstances; ++i) { char buf[16]; snprintf(buf, sizeof(buf), "_%d", i); - delete db[i]; + db[i].reset(); fault_fs[i]->SetFilesystemActive(true); if (getenv("KEEP_DB")) { printf("DB is still at %s%s\n", dbname_.c_str(), buf); @@ -1657,7 +1656,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { std::vector fault_fs; std::vector options; std::vector> listener; - std::vector db; + std::vector> db; std::shared_ptr sfm(NewSstFileManager(def_env)); int kNumDbInstances = 3; Random rnd(301); @@ -1674,7 +1673,6 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { options[i].writable_file_max_buffer_size = 32768; options[i].listeners.emplace_back(listener[i]); options[i].sst_file_manager = sfm; - DB* dbptr; char buf[16]; listener[i]->EnableAutoRecovery(); @@ -1695,8 +1693,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { } snprintf(buf, sizeof(buf), "_%d", i); ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); - ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr)); - db.emplace_back(dbptr); + ASSERT_OK( + DB::Open(options[i], dbname_ + std::string(buf), &db.emplace_back())); } for (auto i = 0; i < kNumDbInstances; ++i) { @@ -1732,7 +1730,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { } for (auto i = 0; i < kNumDbInstances; ++i) { - Status s = static_cast(db[i])->TEST_WaitForCompact(); + Status s = static_cast(db[i].get())->TEST_WaitForCompact(); switch (i) { case 0: ASSERT_EQ(s.severity(), Status::Severity::kSoftError); @@ -1754,7 +1752,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); } if (i == 1) { - ASSERT_OK(static_cast(db[i])->TEST_WaitForCompact()); + ASSERT_OK(static_cast(db[i].get())->TEST_WaitForCompact()); } EXPECT_TRUE(db[i]->GetProperty( "rocksdb.num-files-at-level" + std::to_string(0), &prop)); @@ -1772,7 +1770,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { char buf[16]; snprintf(buf, sizeof(buf), "_%d", i); fault_fs[i]->SetFilesystemActive(true); - delete db[i]; + db[i].reset(); if (getenv("KEEP_DB")) { printf("DB is still at %s%s\n", dbname_.c_str(), buf); } else { diff --git a/db/event_helpers.cc b/db/event_helpers.cc index 2b901f6adc06..5c69f3fb81c6 100644 --- a/db/event_helpers.cc +++ b/db/event_helpers.cc @@ -77,7 +77,12 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( TableFileCreationReason reason, const Status& s, const std::string& file_checksum, const std::string& file_checksum_func_name) { - if (s.ok() && event_logger) { + if (!event_logger && listeners.empty()) { + s.PermitUncheckedError(); + return; + } + + if (event_logger) { JSONWriter jwriter; AppendCurrentTime(&jwriter); jwriter << "cf_name" << cf_name << "job" << job_id << "event" @@ -124,6 +129,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( << "user_defined_timestamps_persisted" << table_properties.user_defined_timestamps_persisted << "key_largest_seqno" << table_properties.key_largest_seqno + << "key_smallest_seqno" << table_properties.key_smallest_seqno << "merge_operator" << table_properties.merge_operator_name << "prefix_extractor_name" << table_properties.prefix_extractor_name << "property_collectors" @@ -165,6 +171,8 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( jwriter << "oldest_blob_file_number" << oldest_blob_file_number; } + jwriter << "status" << s.ToString(); + jwriter.EndObject(); event_logger->Log(jwriter); @@ -195,18 +203,22 @@ void EventHelpers::LogAndNotifyTableFileDeletion( const std::string& file_path, const Status& status, const std::string& dbname, const std::vector>& listeners) { - JSONWriter jwriter; - AppendCurrentTime(&jwriter); - - jwriter << "job" << job_id << "event" << "table_file_deletion" - << "file_number" << file_number; - if (!status.ok()) { - jwriter << "status" << status.ToString(); + if (!event_logger && listeners.empty()) { + status.PermitUncheckedError(); + return; } - jwriter.EndObject(); + if (event_logger) { + JSONWriter jwriter; + AppendCurrentTime(&jwriter); + + jwriter << "job" << job_id << "event" << "table_file_deletion" + << "file_number" << file_number << "status" << status.ToString(); - event_logger->Log(jwriter); + jwriter.EndObject(); + + event_logger->Log(jwriter); + } if (listeners.empty()) { return; @@ -274,7 +286,12 @@ void EventHelpers::LogAndNotifyBlobFileCreationFinished( const std::string& file_checksum, const std::string& file_checksum_func_name, uint64_t total_blob_count, uint64_t total_blob_bytes) { - if (s.ok() && event_logger) { + if (!event_logger && listeners.empty()) { + s.PermitUncheckedError(); + return; + } + + if (event_logger) { JSONWriter jwriter; AppendCurrentTime(&jwriter); jwriter << "cf_name" << cf_name << "job" << job_id << "event" @@ -305,15 +322,17 @@ void EventHelpers::LogAndNotifyBlobFileDeletion( const std::vector>& listeners, int job_id, uint64_t file_number, const std::string& file_path, const Status& status, const std::string& dbname) { + if (!event_logger && listeners.empty()) { + status.PermitUncheckedError(); + return; + } + if (event_logger) { JSONWriter jwriter; AppendCurrentTime(&jwriter); jwriter << "job" << job_id << "event" << "blob_file_deletion" - << "file_number" << file_number; - if (!status.ok()) { - jwriter << "status" << status.ToString(); - } + << "file_number" << file_number << "status" << status.ToString(); jwriter.EndObject(); event_logger->Log(jwriter); diff --git a/db/experimental.cc b/db/experimental.cc index 3691cfe8f741..b6efc1a47534 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -57,7 +57,8 @@ Status GetFileChecksumsFromCurrentManifest(FileSystem* fs, } assert(checksum_list); - const ReadOptions read_options(Env::IOActivity::kReadManifest); + const ReadOptions read_options( + Env::IOActivity::kGetFileChecksumsFromCurrentManifest); checksum_list->reset(); std::unique_ptr file_reader; @@ -87,11 +88,12 @@ Status GetFileChecksumsFromCurrentManifest(FileSystem* fs, // Read all records from the manifest file... uint64_t manifest_file_size = std::numeric_limits::max(); - FileChecksumRetriever retriever(read_options, manifest_file_size, - *checksum_list); + FileChecksumRetriever retriever(read_options, manifest_file_size); retriever.Iterate(reader, &s); - - return retriever.status(); + if (!retriever.status().ok()) { + return retriever.status(); + } + return retriever.FetchFileChecksumList(*checksum_list); } Status UpdateManifestForFilesState( @@ -156,15 +158,17 @@ Status UpdateManifestForFilesState( // Current state inconsistent with manifest ++files_updated; edit.DeleteFile(level, number); - edit.AddFile( - level, number, lf->fd.GetPathId(), lf->fd.GetFileSize(), - lf->smallest, lf->largest, lf->fd.smallest_seqno, - lf->fd.largest_seqno, lf->marked_for_compaction, temp, - lf->oldest_blob_file_number, lf->oldest_ancester_time, - lf->file_creation_time, lf->epoch_number, lf->file_checksum, - lf->file_checksum_func_name, lf->unique_id, - lf->compensated_range_deletion_size, lf->tail_size, - lf->user_defined_timestamps_persisted); + edit.AddFile(level, lf->fd.GetNumber(), lf->fd.GetPathId(), + lf->fd.GetFileSize(), lf->smallest, lf->largest, + lf->fd.smallest_seqno, lf->fd.largest_seqno, + lf->marked_for_compaction, temp, + lf->oldest_blob_file_number, + lf->oldest_ancester_time, lf->file_creation_time, + lf->epoch_number, lf->file_checksum, + lf->file_checksum_func_name, lf->unique_id, + lf->compensated_range_deletion_size, lf->tail_size, + lf->user_defined_timestamps_persisted, + lf->min_timestamp, lf->max_timestamp); } } } else { @@ -1184,7 +1188,8 @@ class SstQueryFilterConfigsManagerImpl : public SstQueryFilterConfigsManager { break; default: // TODO? Report problem - {} + { + } // Unknown filter type } if (!may_match) { diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index 69b2668aea80..326b3d567a09 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -16,6 +16,7 @@ #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/defer.h" +#include "util/file_checksum_helper.h" #include "util/random.h" #include "utilities/fault_injection_env.h" @@ -260,55 +261,6 @@ TEST_F(ExternalSSTFileBasicTest, Basic) { s = sst_file_writer.DeleteRange(Key(100), Key(200)); ASSERT_NOK(s) << s.ToString(); - DestroyAndReopen(options); - - SyncPoint::GetInstance()->LoadDependency({ - {"DBImpl::IngestExternalFile:AfterIncIngestFileCounter", - "ExternalSSTFileBasicTest.LiveWriteStart"}, - {"WriteThread::JoinBatchGroup:Wait", - "DBImpl::IngestExternalFile:AfterIncIngestFileCounter:2"}, - }); - SyncPoint::GetInstance()->EnableProcessing(); - PerfContext* write_thread_perf_context; - std::thread write_thread([&] { - TEST_SYNC_POINT("ExternalSSTFileBasicTest.LiveWriteStart"); - SetPerfLevel(kEnableWait); - write_thread_perf_context = get_perf_context(); - write_thread_perf_context->Reset(); - ASSERT_OK(db_->Put(WriteOptions(), "bar", "v2")); - ASSERT_GT(write_thread_perf_context->write_thread_wait_nanos, 0); - // Test sync points were used to make sure this live write enter write - // thread after the file ingestion entered write thread. So by the time this - // live write finishes, the latest seqno is 1 means file ingestion used - // seqno 0. - ASSERT_EQ(db_->GetLatestSequenceNumber(), 1U); - }); - - // Add file using file path - SetPerfLevel(kEnableTimeExceptForMutex); - PerfContext* perf_ctx = get_perf_context(); - perf_ctx->Reset(); - s = DeprecatedAddFile({file1}); - ASSERT_GT(perf_context.file_ingestion_nanos, 0); - ASSERT_GT(perf_context.file_ingestion_blocking_live_writes_nanos, 0); - ASSERT_OK(s) << s.ToString(); - for (int k = 0; k < 100; k++) { - ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); - } - - write_thread.join(); - SyncPoint::GetInstance()->DisableProcessing(); - - // Re-ingest the file just to check the perf context not enabled at and below - // kEnableWait. - SetPerfLevel(kEnableWait); - perf_ctx->Reset(); - IngestExternalFileOptions opts; - opts.allow_global_seqno = true; - opts.allow_blocking_flush = true; - ASSERT_OK(db_->IngestExternalFile({file1}, opts)); - ASSERT_EQ(perf_context.file_ingestion_nanos, 0); - ASSERT_EQ(perf_context.file_ingestion_blocking_live_writes_nanos, 0); DestroyAndRecreateExternalSSTFilesDir(); } @@ -395,7 +347,8 @@ class ChecksumVerifyHelper { Status GetSingleFileChecksumAndFuncName( const std::string& file_path, std::string* file_checksum, - std::string* file_checksum_func_name) { + std::string* file_checksum_func_name, + const std::string& requested_func_name = {}) { Status s; EnvOptions soptions; std::unique_ptr file_reader; @@ -413,6 +366,8 @@ class ChecksumVerifyHelper { return Status::OK(); } else { FileChecksumGenContext gen_context; + gen_context.file_name = file_path; + gen_context.requested_checksum_func_name = requested_func_name; std::unique_ptr file_checksum_gen = file_checksum_gen_factory->CreateFileChecksumGenerator(gen_context); *file_checksum_func_name = file_checksum_gen->Name(); @@ -488,10 +443,50 @@ TEST_F(ExternalSSTFileBasicTest, BasicWithFileChecksumCrc32c) { DestroyAndRecreateExternalSSTFilesDir(); } +namespace { +class VariousFileChecksumGenerator : public FileChecksumGenCrc32c { + public: + explicit VariousFileChecksumGenerator(const std::string& name) + : FileChecksumGenCrc32c({}), name_(name) {} + + const char* Name() const override { return name_.c_str(); } + + std::string GetChecksum() const override { + return FileChecksumGenCrc32c::GetChecksum() + "_" + name_; + } + + private: + const std::string name_; +}; + +class VariousFileChecksumGenFactory : public FileChecksumGenFactory { + public: + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& context) override { + static RelaxedAtomic counter{0}; + if (Slice(context.requested_checksum_func_name).starts_with("Various")) { + return std::make_unique( + context.requested_checksum_func_name); + } else if (context.requested_checksum_func_name.empty()) { + // Lacking a specific request, use a different function name for each + // result. + return std::make_unique( + "Various" + std::to_string(counter.FetchAddRelaxed(1))); + } else { + return nullptr; + } + } + + static const char* kClassName() { return "VariousFileChecksumGenFactory"; } + const char* Name() const override { return kClassName(); } +}; +} // namespace + TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { Options old_options = CurrentOptions(); Options options = CurrentOptions(); - options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + options.file_checksum_gen_factory = + std::make_shared(); const ImmutableCFOptions ioptions(options); ChecksumVerifyHelper checksum_helper(options); @@ -512,7 +507,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { ASSERT_EQ(file1_info.largest_key, Key(1099)); std::string file_checksum1, file_checksum_func_name1; ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( - file1, &file_checksum1, &file_checksum_func_name1)); + file1, &file_checksum1, &file_checksum_func_name1, + file1_info.file_checksum_func_name)); ASSERT_EQ(file1_info.file_checksum, file_checksum1); ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name1); @@ -531,7 +527,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { ASSERT_EQ(file2_info.largest_key, Key(1299)); std::string file_checksum2, file_checksum_func_name2; ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( - file2, &file_checksum2, &file_checksum_func_name2)); + file2, &file_checksum2, &file_checksum_func_name2, + file2_info.file_checksum_func_name)); ASSERT_EQ(file2_info.file_checksum, file_checksum2); ASSERT_EQ(file2_info.file_checksum_func_name, file_checksum_func_name2); @@ -550,7 +547,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { ASSERT_EQ(file3_info.largest_key, Key(1499)); std::string file_checksum3, file_checksum_func_name3; ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( - file3, &file_checksum3, &file_checksum_func_name3)); + file3, &file_checksum3, &file_checksum_func_name3, + file3_info.file_checksum_func_name)); ASSERT_EQ(file3_info.file_checksum, file_checksum3); ASSERT_EQ(file3_info.file_checksum_func_name, file_checksum_func_name3); @@ -569,7 +567,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { ASSERT_EQ(file4_info.largest_key, Key(1799)); std::string file_checksum4, file_checksum_func_name4; ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( - file4, &file_checksum4, &file_checksum_func_name4)); + file4, &file_checksum4, &file_checksum_func_name4, + file4_info.file_checksum_func_name)); ASSERT_EQ(file4_info.file_checksum, file_checksum4); ASSERT_EQ(file4_info.file_checksum_func_name, file_checksum_func_name4); @@ -588,7 +587,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { ASSERT_EQ(file5_info.largest_key, Key(1999)); std::string file_checksum5, file_checksum_func_name5; ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( - file5, &file_checksum5, &file_checksum_func_name5)); + file5, &file_checksum5, &file_checksum_func_name5, + file5_info.file_checksum_func_name)); ASSERT_EQ(file5_info.file_checksum, file_checksum5); ASSERT_EQ(file5_info.file_checksum_func_name, file_checksum_func_name5); @@ -607,7 +607,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { ASSERT_EQ(file6_info.largest_key, Key(2199)); std::string file_checksum6, file_checksum_func_name6; ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( - file6, &file_checksum6, &file_checksum_func_name6)); + file6, &file_checksum6, &file_checksum_func_name6, + file6_info.file_checksum_func_name)); ASSERT_EQ(file6_info.file_checksum, file_checksum6); ASSERT_EQ(file6_info.file_checksum_func_name, file_checksum_func_name6); @@ -677,18 +678,23 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { } ASSERT_OK(env_->FileExists(file2)); - // Enable verify_file_checksum option - // No checksum information is provided, generate it when ingesting - std::vector checksum, checksum_func; - s = AddFileWithFileChecksum({file3}, checksum, checksum_func, true, false, - false, false); + // Enable verify_file_checksum option. No checksum information is provided, + // so it is generated when ingesting. The configured checksum factory will + // use a different function than before. + s = AddFileWithFileChecksum({file3}, {}, {}, true, false, false, false); ASSERT_OK(s) << s.ToString(); std::vector live_files2; dbfull()->GetLiveFilesMetaData(&live_files2); for (const auto& f : live_files2) { if (set1.find(f.name) == set1.end()) { - ASSERT_EQ(f.file_checksum, file_checksum3); - ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name3); + // Recomputed checksum, different function + EXPECT_NE(f.file_checksum_func_name, file_checksum_func_name3); + std::string cur_checksum3, cur_checksum_func_name3; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + dbname_ + f.name, &cur_checksum3, &cur_checksum_func_name3, + f.file_checksum_func_name)); + EXPECT_EQ(f.file_checksum, cur_checksum3); + EXPECT_EQ(f.file_checksum_func_name, cur_checksum_func_name3); set1.insert(f.name); } } @@ -702,8 +708,9 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { ASSERT_NOK(s) << s.ToString(); // Does not enable verify_file_checksum options - // Checksum function name matches, store the checksum being ingested. - s = AddFileWithFileChecksum({file4}, {"asd"}, {file_checksum_func_name4}, + // Checksum function name is recognized, so store the checksum being ingested. + std::string file_checksum_func_name4alt = "VariousABCD"; + s = AddFileWithFileChecksum({file4}, {"asd"}, {file_checksum_func_name4alt}, false, false, false, false); ASSERT_OK(s) << s.ToString(); std::vector live_files3; @@ -712,7 +719,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { if (set1.find(f.name) == set1.end()) { ASSERT_FALSE(f.file_checksum == file_checksum4); ASSERT_EQ(f.file_checksum, "asd"); - ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name4); + ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name4alt); set1.insert(f.name); } } @@ -721,7 +728,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { // enable verify_file_checksum options, DB enable checksum, and enable // write_global_seq. So the checksum stored is different from the one - // ingested due to the sequence number changes. + // ingested due to the sequence number changes. The checksum function name + // may also change since the checksum is recomputed. s = AddFileWithFileChecksum({file5}, {file_checksum5}, {file_checksum_func_name5}, true, false, false, true); @@ -730,11 +738,14 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { dbfull()->GetLiveFilesMetaData(&live_files4); for (const auto& f : live_files4) { if (set1.find(f.name) == set1.end()) { + // Recomputed checksum, different function + EXPECT_NE(f.file_checksum_func_name, file_checksum_func_name5); std::string cur_checksum5, cur_checksum_func_name5; ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( - dbname_ + f.name, &cur_checksum5, &cur_checksum_func_name5)); - ASSERT_EQ(f.file_checksum, cur_checksum5); - ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name5); + dbname_ + f.name, &cur_checksum5, &cur_checksum_func_name5, + f.file_checksum_func_name)); + EXPECT_EQ(f.file_checksum, cur_checksum5); + EXPECT_EQ(f.file_checksum_func_name, cur_checksum_func_name5); set1.insert(f.name); } } @@ -742,18 +753,22 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { ASSERT_OK(env_->FileExists(file5)); // Does not enable verify_file_checksum options and also the ingested file - // checksum information is empty. DB will generate and store the checksum - // in Manifest. - std::vector files_c6, files_name6; - s = AddFileWithFileChecksum({file6}, files_c6, files_name6, false, false, - false, false); + // checksum information is empty. DB will generate and store file checksum + // in Manifest, which could be different from the previous invocation. + s = AddFileWithFileChecksum({file6}, {}, {}, false, false, false, false); ASSERT_OK(s) << s.ToString(); std::vector live_files6; dbfull()->GetLiveFilesMetaData(&live_files6); for (const auto& f : live_files6) { if (set1.find(f.name) == set1.end()) { - ASSERT_EQ(f.file_checksum, file_checksum6); - ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name6); + // Recomputed checksum, different function + EXPECT_NE(f.file_checksum_func_name, file_checksum_func_name6); + std::string cur_checksum6, cur_checksum_func_name6; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + dbname_ + f.name, &cur_checksum6, &cur_checksum_func_name6, + f.file_checksum_func_name)); + EXPECT_EQ(f.file_checksum, cur_checksum6); + EXPECT_EQ(f.file_checksum_func_name, cur_checksum_func_name6); set1.insert(f.name); } } @@ -1954,21 +1969,44 @@ TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) { SstFileWriter sst_file_writer(EnvOptions(), options); std::string file3 = sst_files_dir_ + "file3.sst"; ASSERT_OK(sst_file_writer.Open(file3)); - ASSERT_OK(sst_file_writer.Put("j", "j1")); + ASSERT_OK(sst_file_writer.Put("k", "k1")); ASSERT_OK(sst_file_writer.Put("m", "m1")); ExternalSstFileInfo file3_info; ASSERT_OK(sst_file_writer.Finish(&file3_info)); files.push_back(std::move(file3)); } + // This could be ingested to the same level as file3 and file4, but the + // greedy/simple overlap check relegates it to a later level + { + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file4 = sst_files_dir_ + "file4.sst"; + ASSERT_OK(sst_file_writer.Open(file4)); + ASSERT_OK(sst_file_writer.Put("j", "j1")); + ExternalSstFileInfo file4_info; + ASSERT_OK(sst_file_writer.Finish(&file4_info)); + files.push_back(std::move(file4)); + } + + { + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file5 = sst_files_dir_ + "file5.sst"; + ASSERT_OK(sst_file_writer.Open(file5)); + ASSERT_OK(sst_file_writer.Put("i", "i3")); + ExternalSstFileInfo file5_info; + ASSERT_OK(sst_file_writer.Finish(&file5_info)); + files.push_back(std::move(file5)); + } + IngestExternalFileOptions ifo; ifo.allow_global_seqno = false; ASSERT_NOK(db_->IngestExternalFile(files, ifo)); ifo.allow_global_seqno = true; ASSERT_OK(db_->IngestExternalFile(files, ifo)); ASSERT_EQ(Get("a"), "a1"); - ASSERT_EQ(Get("i"), "i2"); + ASSERT_EQ(Get("i"), "i3"); ASSERT_EQ(Get("j"), "j1"); + ASSERT_EQ(Get("k"), "k1"); ASSERT_EQ(Get("m"), "m1"); int total_keys = 0; @@ -1979,10 +2017,11 @@ TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) { } ASSERT_OK(iter->status()); delete iter; - ASSERT_EQ(total_keys, 4); + ASSERT_EQ(total_keys, 5); ASSERT_EQ(1, NumTableFilesAtLevel(6)); ASSERT_EQ(2, NumTableFilesAtLevel(5)); + ASSERT_EQ(2, NumTableFilesAtLevel(4)); } class CompactionJobStatsCheckerForFilteredFiles : public EventListener { @@ -2528,7 +2567,14 @@ TEST_F(ExternalSSTFileBasicTest, IngestWithTemperature) { options.default_write_temperature = Temperature::kHot; SstFileWriter sst_file_writer(EnvOptions(), options); options.level0_file_num_compaction_trigger = 2; - options.allow_ingest_behind = (mode == "ingest_behind"); + bool cf_option = Random::GetTLSInstance()->OneIn(2); + SCOPED_TRACE(std::string("Use ") + (cf_option ? "CF" : "DB") + + " option for ingest behind"); + if (cf_option) { + options.cf_allow_ingest_behind = (mode == "ingest_behind"); + } else { + options.allow_ingest_behind = (mode == "ingest_behind"); + } Reopen(options); Defer destroyer([&]() { Destroy(options); }); @@ -2669,51 +2715,358 @@ TEST_F(ExternalSSTFileBasicTest, IngestWithTemperature) { } } -TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevel) { +// This tests an internal user's exact usage and expectation of the +// IngestExternalFiles APIs to bulk load and replace files. +TEST_F(ExternalSSTFileBasicTest, + AtomicReplaceColumnFamilyWithIngestedVersionKey) { Options options = GetDefaultOptions(); - - std::string file_path = sst_files_dir_ + std::to_string(1); - SstFileWriter sfw(EnvOptions(), options); - - ASSERT_OK(sfw.Open(file_path)); - ASSERT_OK(sfw.Put("b", "dontcare")); - ASSERT_OK(sfw.Finish()); - - // Test universal compaction + ingest with snapshot consistency options.create_if_missing = true; options.compaction_style = CompactionStyle::kCompactionStyleUniversal; + options.num_levels = 7; + options.disallow_memtable_writes = false; + DestroyAndReopen(options); - { - const Snapshot* snapshot = db_->GetSnapshot(); - ManagedSnapshot snapshot_guard(db_, snapshot); - IngestExternalFileOptions ifo; - ifo.fail_if_not_bottommost_level = true; - ifo.snapshot_consistency = true; - const Status s = db_->IngestExternalFile({file_path}, ifo); - ASSERT_TRUE(s.ok()); + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string data_file_original = sst_files_dir_ + "data_original"; + ASSERT_OK(sst_file_writer.Open(data_file_original)); + ASSERT_OK(sst_file_writer.Put("ukey1", "uval1_orig")); + ASSERT_OK(sst_file_writer.Put("ukey2", "uval2_orig")); + ASSERT_OK(sst_file_writer.Finish()); + ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), + {data_file_original}, + IngestExternalFileOptions())); + + ASSERT_OK(Put("data_version", "v_original")); + ASSERT_OK(Flush()); + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "data_version", &value)); + ASSERT_EQ(value, "v_original"); + ASSERT_OK(db_->Get(ReadOptions(), "ukey1", &value)); + ASSERT_EQ(value, "uval1_orig"); + ASSERT_OK(db_->Get(ReadOptions(), "ukey2", &value)); + ASSERT_EQ(value, "uval2_orig"); + // Set up a 1) data version key file on L0, and 2) a user data file on L6 + // to test the initial transitioning to use `atomic_replace_range`. + ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel()); + + // Test multiple cycles of replacing by atomically ingest a data file and a + // version key file while replace the whole range in the column family. + for (int i = 0; i < 10; i++) { + std::string version_file_path = + sst_files_dir_ + "version" + std::to_string(i); + ASSERT_OK(sst_file_writer.Open(version_file_path)); + ASSERT_OK(sst_file_writer.Put("data_version", "v" + std::to_string(i))); + ASSERT_OK(sst_file_writer.Finish()); + + std::string file_path = sst_files_dir_ + std::to_string(i); + ASSERT_OK(sst_file_writer.Open(file_path)); + ASSERT_OK(sst_file_writer.Put("ukey1", "uval1" + std::to_string(i))); + ASSERT_OK(sst_file_writer.Put("ukey2", "uval2" + std::to_string(i))); + ASSERT_OK(sst_file_writer.Finish()); + + IngestExternalFileArg arg; + arg.column_family = db_->DefaultColumnFamily(); + arg.external_files = {version_file_path, file_path}; + arg.atomic_replace_range = {{nullptr, nullptr}}; + // Test both fail_if_not_bottomost_level: true and false + arg.options.fail_if_not_bottommost_level = i % 2 == 0; + arg.options.snapshot_consistency = false; + // Ingest 1) a new data version file and 2) a new user data file while erase + // the whole column family + Status s = db_->IngestExternalFiles({arg}); + ASSERT_OK(s); + + // Check ingestion result and the expected LSM shape: + // Two files on L6, 1) a data version file 2) a user data file. + ASSERT_OK(db_->Get(ReadOptions(), "ukey1", &value)); + ASSERT_EQ(value, "uval1" + std::to_string(i)); + ASSERT_OK(db_->Get(ReadOptions(), "ukey2", &value)); + ASSERT_EQ(value, "uval2" + std::to_string(i)); + ASSERT_OK(db_->Get(ReadOptions(), "data_version", &value)); + ASSERT_EQ(value, "v" + std::to_string(i)); + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); } - // Test level compaction - options.compaction_style = CompactionStyle::kCompactionStyleLevel; - options.num_levels = 2; - DestroyAndReopen(options); - ASSERT_OK(db_->Put(WriteOptions(), "a", "dontcare")); - ASSERT_OK(db_->Put(WriteOptions(), "c", "dontcare")); - ASSERT_OK(db_->Flush(FlushOptions())); + Close(); +} - ASSERT_OK(db_->Put(WriteOptions(), "b", "dontcare")); - ASSERT_OK(db_->Put(WriteOptions(), "d", "dontcare")); - ASSERT_OK(db_->Flush(FlushOptions())); +TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) { + for (bool disallow_memtable : {false, true}) { + Options options = GetDefaultOptions(); - { - CompactRangeOptions cro; - cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; - ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + // First test with universal compaction + options.create_if_missing = true; + options.compaction_style = CompactionStyle::kCompactionStyleUniversal; + DestroyAndReopen(options); - IngestExternalFileOptions ifo; - ifo.fail_if_not_bottommost_level = true; - const Status s = db_->IngestExternalFile({file_path}, ifo); - ASSERT_TRUE(s.IsTryAgain()); + // And a CF potentially disallowing memtable write + options.disallow_memtable_writes = disallow_memtable; + CreateColumnFamilies({"cf0"}, options); + ASSERT_EQ(db_->GetOptions(handles_[0]).disallow_memtable_writes, + disallow_memtable); + + // Ingest with snapshot consistency + std::string file_path = sst_files_dir_ + std::to_string(1); + std::string file_path2 = sst_files_dir_ + std::to_string(2); + SstFileWriter sfw(EnvOptions(), options); + + ASSERT_OK(sfw.Open(file_path)); + ASSERT_OK(sfw.Put("b", "0")); + ASSERT_OK(sfw.Finish()); + + { + const Snapshot* snapshot = db_->GetSnapshot(); + ManagedSnapshot snapshot_guard(db_.get(), snapshot); + IngestExternalFileOptions ifo; + ifo.fail_if_not_bottommost_level = true; + ifo.snapshot_consistency = true; + ASSERT_OK(db_->IngestExternalFile(handles_[0], {file_path}, ifo)); + } + ASSERT_EQ(Get(0, "b"), "0"); + + // Test level compaction + options.compaction_style = CompactionStyle::kCompactionStyleLevel; + options.num_levels = 2; + CreateColumnFamilies({"cf1"}, options); + ASSERT_EQ(db_->GetOptions(handles_[1]).disallow_memtable_writes, + disallow_memtable); + + if (!disallow_memtable) { + ASSERT_OK(Put(1, "a", "1")); + ASSERT_OK(Put(1, "c", "3")); + ASSERT_OK(Flush(1)); + + ASSERT_OK(Put(1, "b", "2")); + ASSERT_OK(Put(1, "d", "4")); + ASSERT_OK(Flush(1)); + } else { + // Memtable write disallowed + EXPECT_EQ(Put(1, "a", "1").code(), Status::Code::kInvalidArgument); + + // Use ingestion to get to the same state as above + ASSERT_OK(sfw.Open(file_path2)); + ASSERT_OK(sfw.Put("a", "1")); + ASSERT_OK(sfw.Put("c", "3")); + ASSERT_OK(sfw.Finish()); + ASSERT_OK(db_->IngestExternalFile(handles_[1], {file_path2}, {})); + + ASSERT_OK(sfw.Open(file_path2)); + ASSERT_OK(sfw.Put("b", "2")); + ASSERT_OK(sfw.Put("d", "4")); + ASSERT_OK(sfw.Finish()); + ASSERT_OK(db_->IngestExternalFile(handles_[1], {file_path2}, {})); + } + ASSERT_EQ(Get(1, "a"), "1"); + ASSERT_EQ(Get(1, "b"), "2"); + ASSERT_EQ(Get(1, "c"), "3"); + ASSERT_EQ(Get(1, "d"), "4"); + + { + // Test fail_if_not_bottommost_level, which fails if there's any overlap + // anywhere, even with snapshot_consistency=false + IngestExternalFileOptions ifo; + ASSERT_FALSE(ifo.fail_if_not_bottommost_level); + ifo.fail_if_not_bottommost_level = true; + ifo.snapshot_consistency = false; + // Fails with overlap on earlier level + Status s = db_->IngestExternalFile(handles_[1], {file_path}, ifo); + ASSERT_EQ(s.code(), Status::Code::kTryAgain); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr)); + + // Fails with overlap on last level + s = db_->IngestExternalFile(handles_[1], {file_path}, ifo); + ASSERT_EQ(s.code(), Status::Code::kTryAgain); + + // No change to data + ASSERT_EQ(Get(1, "a"), "1"); + ASSERT_EQ(Get(1, "b"), "2"); + ASSERT_EQ(Get(1, "c"), "3"); + ASSERT_EQ(Get(1, "d"), "4"); + } + + if (!disallow_memtable) { + // Test allow_blocking_flush=false (fail because of memtable overlap) + IngestExternalFileOptions ifo; + ASSERT_TRUE(ifo.allow_blocking_flush); + ifo.allow_blocking_flush = false; + ASSERT_OK(Put(1, "b", "42")); + Status s = db_->IngestExternalFile(handles_[1], {file_path}, ifo); + ASSERT_EQ(s.code(), Status::Code::kInvalidArgument); + + ASSERT_EQ(Get(1, "a"), "1"); + ASSERT_EQ(Get(1, "b"), "42"); + ASSERT_EQ(Get(1, "c"), "3"); + ASSERT_EQ(Get(1, "d"), "4"); + + // Revert state + ASSERT_OK(Put(1, "b", "2")); + ASSERT_OK(Flush(1)); + } + + { + // Test atomic_replace_range + IngestExternalFileArg arg; + arg.column_family = handles_[1]; + arg.external_files = {file_path}; + arg.atomic_replace_range = {{"a", "zzz"}}; + + // start with some failure cases + // TODO: support snapshot consistency with tombstone file + ASSERT_TRUE(arg.options.snapshot_consistency); + Status s = db_->IngestExternalFiles({arg}); + ASSERT_EQ(s.code(), Status::Code::kNotSupported); + + ASSERT_EQ(Get(1, "a"), "1"); + ASSERT_EQ(Get(1, "b"), "2"); + ASSERT_EQ(Get(1, "c"), "3"); + ASSERT_EQ(Get(1, "d"), "4"); + + arg.options.snapshot_consistency = false; + // Can usually be used with atomic_replace_range and + // snapshot_consistency=false, except it requires no input overlap + arg.options.fail_if_not_bottommost_level = true; + + // one-sided ranges not yet supported + arg.atomic_replace_range = {{{}, "zzz"}}; + s = db_->IngestExternalFiles({arg}); + ASSERT_EQ(s.code(), Status::Code::kNotSupported); + + arg.atomic_replace_range = {{"a", {}}}; + s = db_->IngestExternalFiles({arg}); + ASSERT_EQ(s.code(), Status::Code::kNotSupported); + + // rejected because doesn't cover ingested file + arg.atomic_replace_range = {{"x", "z"}}; + s = db_->IngestExternalFiles({arg}); + ASSERT_EQ(s.code(), Status::Code::kInvalidArgument); + + // rejected because of partial file overlap + arg.atomic_replace_range = {{"a", "c"}}; + s = db_->IngestExternalFiles({arg}); + ASSERT_EQ(s.code(), Status::Code::kInvalidArgument); + + if (!disallow_memtable) { + // memtable overlap with replace range + ASSERT_OK(Put(1, "e", "5")); + arg.options.allow_blocking_flush = false; + + // rejected because of memtable overlap + arg.atomic_replace_range = {{"a", "z"}}; + s = db_->IngestExternalFiles({arg}); + ASSERT_EQ(s.code(), Status::Code::kInvalidArgument); + + // rejected because of memtable overlap + arg.atomic_replace_range = {{nullptr, nullptr}}; + s = db_->IngestExternalFiles({arg}); + ASSERT_EQ(s.code(), Status::Code::kInvalidArgument); + + // FIXME: upper bound should be exclusive (DeleteRange semantics). + // currently rejected because of documented bug + arg.atomic_replace_range = {{"a", "e"}}; + s = db_->IngestExternalFiles({arg}); + ASSERT_EQ(s.code(), Status::Code::kInvalidArgument); + + // work-around ensuring no memtable overlap + arg.atomic_replace_range = {{"a", "d2"}}; + ASSERT_OK(db_->IngestExternalFiles({arg})); + + ASSERT_EQ(Get(1, "e"), "5"); + } else { + // rejected because of partial file overlap + arg.atomic_replace_range = {{"b", "z"}}; + s = db_->IngestExternalFiles({arg}); + ASSERT_EQ(s.code(), Status::Code::kInvalidArgument); + + // no memtable complications + arg.atomic_replace_range = {{"a", "z"}}; + ASSERT_OK(db_->IngestExternalFiles({arg})); + + ASSERT_EQ(Get(1, "e"), "NOT_FOUND"); + } + ASSERT_EQ(Get(1, "a"), "NOT_FOUND"); + ASSERT_EQ(Get(1, "b"), "0"); + ASSERT_EQ(Get(1, "c"), "NOT_FOUND"); + ASSERT_EQ(Get(1, "d"), "NOT_FOUND"); + + // The single ingested file replaced everything (except perhaps memtable) + std::vector live_files; + db_->GetLiveFilesMetaData(&live_files); + // One file in each CF + ASSERT_EQ(live_files.size(), 2); + + ASSERT_OK(sfw.Open(file_path)); + ASSERT_OK(sfw.Put("f", "6")); + ASSERT_OK(sfw.Finish()); + + // Another file + ASSERT_OK(sfw.Open(file_path2)); + ASSERT_OK(sfw.Put("f", "7")); + ASSERT_OK(sfw.Put("g", "8")); + ASSERT_OK(sfw.Finish()); + + if (!disallow_memtable) { + // rejected because of memtable overlap with range + arg.atomic_replace_range = {{"e", "z"}}; + s = db_->IngestExternalFiles({arg}); + ASSERT_EQ(s.code(), Status::Code::kInvalidArgument); + + // allow blocking flush of "e" (which is then replaced), and the file + // with just "b" is not replaced + arg.options.allow_blocking_flush = true; + ASSERT_OK(db_->IngestExternalFiles({arg})); + + ASSERT_EQ(Get(1, "b"), "0"); + ASSERT_EQ(Get(1, "e"), "NOT_FOUND"); + ASSERT_EQ(Get(1, "f"), "6"); + ASSERT_EQ(Get(1, "g"), "NOT_FOUND"); + + // memtable overlap with replace range + ASSERT_OK(Put(1, "e", "5")); + arg.options.allow_blocking_flush = false; + arg.external_files = {file_path2}; + + // rejected because of memtable overlap + arg.atomic_replace_range = {{nullptr, nullptr}}; + s = db_->IngestExternalFiles({arg}); + ASSERT_EQ(s.code(), Status::Code::kInvalidArgument); + + // Replace everything, including with memtable flush + arg.options.allow_blocking_flush = true; + ASSERT_OK(db_->IngestExternalFiles({arg})); + + ASSERT_EQ(Get(1, "b"), "NOT_FOUND"); + ASSERT_EQ(Get(1, "e"), "NOT_FOUND"); + ASSERT_EQ(Get(1, "f"), "7"); + ASSERT_EQ(Get(1, "g"), "8"); + } else { + arg.external_files = {file_path2, file_path}; + + // rejected because of overlap in files to ingest with fail_if_ = true + arg.atomic_replace_range = {{"e", "z"}}; + s = db_->IngestExternalFiles({arg}); + ASSERT_EQ(s.code(), Status::Code::kTryAgain); + + arg.options.fail_if_not_bottommost_level = false; + + // rejected because range doesn't cover ingested files + // FIXME: upper bound should be exclusive "g" instead + arg.atomic_replace_range = {{"e", "f2"}}; + s = db_->IngestExternalFiles({arg}); + ASSERT_EQ(s.code(), Status::Code::kInvalidArgument); + + // Loaded into different levels, and the file with just "b" is not + // replaced + arg.atomic_replace_range = {{"e", "z"}}; + ASSERT_OK(db_->IngestExternalFiles({arg})); + + ASSERT_EQ(Get(1, "b"), "0"); + ASSERT_EQ(Get(1, "f"), "6"); // earlier file listed later to ingest + ASSERT_EQ(Get(1, "g"), "8"); + } + } } } diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index a439189afa7e..7a379b9df790 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -11,6 +11,7 @@ #include #include +#include "db/builder.h" #include "db/db_impl/db_impl.h" #include "db/version_edit.h" #include "file/file_util.h" @@ -29,6 +30,7 @@ Status ExternalSstFileIngestionJob::Prepare( const std::vector& external_files_paths, const std::vector& files_checksums, const std::vector& files_checksum_func_names, + const std::optional& atomic_replace_range, const Temperature& file_temperature, uint64_t next_file_number, SuperVersion* sv) { Status status; @@ -41,6 +43,9 @@ Status ExternalSstFileIngestionJob::Prepare( status = GetIngestedFileInfo(file_path, next_file_number++, &file_to_ingest, sv); if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get ingested file info: %s: %s", + file_path.c_str(), status.ToString().c_str()); return status; } @@ -80,33 +85,69 @@ Status ExternalSstFileIngestionJob::Prepare( std::sort(sorted_files.begin(), sorted_files.end(), file_range_checker_); for (size_t i = 0; i + 1 < num_files; i++) { - if (file_range_checker_.OverlapsWithPrev(sorted_files[i], - sorted_files[i + 1], - /* ranges_sorted= */ true)) { + if (file_range_checker_.Overlaps(*sorted_files[i], *sorted_files[i + 1], + /* known_sorted= */ true)) { files_overlap_ = true; break; } } } - if (ingestion_options_.ingest_behind && files_overlap_) { - return Status::NotSupported( - "Files with overlapping ranges cannot be ingested with ingestion " - "behind mode."); + if (atomic_replace_range.has_value()) { + atomic_replace_range_.emplace(); + + if (atomic_replace_range->start && atomic_replace_range->limit) { + // User keys to internal keys (with timestamps) + const size_t ts_sz = ucmp_->timestamp_size(); + std::string start_with_ts, limit_with_ts; + auto [start, limit] = MaybeAddTimestampsToRange( + atomic_replace_range->start, atomic_replace_range->limit, ts_sz, + &start_with_ts, &limit_with_ts); + assert(start.has_value()); + assert(limit.has_value()); + atomic_replace_range_->smallest_internal_key.Set( + *start, kMaxSequenceNumber, kValueTypeForSeek); + atomic_replace_range_->largest_internal_key.Set( + *limit, kMaxSequenceNumber, kValueTypeForSeek); + // Check files to ingest against replace range + for (size_t i = 0; i < num_files; i++) { + if (!file_range_checker_.Contains(*atomic_replace_range_, + files_to_ingest_[i])) { + return Status::InvalidArgument( + "Atomic replace range does not contain all files"); + } + } + } else { + // Currently if either bound is not present, both must be + assert(atomic_replace_range->start.has_value() == false); + assert(atomic_replace_range->limit.has_value() == false); + assert(atomic_replace_range_->smallest_internal_key.unset()); + assert(atomic_replace_range_->largest_internal_key.unset()); + } } - // Overlapping files need at least two different sequence numbers. If settings - // disables global seqno, ingestion will fail anyway, so fail fast in prepare. - if (!ingestion_options_.allow_global_seqno && files_overlap_) { - return Status::InvalidArgument( - "Global seqno is required, but disabled (because external files key " - "range overlaps)."); - } + if (files_overlap_) { + if (ingestion_options_.ingest_behind) { + return Status::NotSupported( + "Files with overlapping ranges cannot be ingested with ingestion " + "behind mode."); + } - if (ucmp_->timestamp_size() > 0 && files_overlap_) { - return Status::NotSupported( - "Files with overlapping ranges cannot be ingested to column " - "family with user-defined timestamp enabled."); + // Overlapping files need at least two different sequence numbers. If + // settings disables global seqno, ingestion will fail anyway, so fail + // fast in prepare. + if (!ingestion_options_.allow_global_seqno && + !ingestion_options_.allow_db_generated_files) { + return Status::InvalidArgument( + "Global seqno is required, but disabled (because external files key " + "range overlaps)."); + } + + if (ucmp_->timestamp_size() > 0) { + return Status::NotSupported( + "Files with overlapping ranges cannot be ingested to column " + "family with user-defined timestamp enabled."); + } } // Copy/Move external files into DB @@ -123,6 +164,14 @@ Status ExternalSstFileIngestionJob::Prepare( // It is unsafe to assume application had sync the file and file // directory before ingest the file. For integrity of RocksDB we need // to sync the file. + + // TODO(xingbo), We should in general be moving away from production + // uses of ReuseWritableFile (except explicitly for WAL recycling), + // ReopenWritableFile, and NewRandomRWFile. We should create a + // FileSystem::SyncFile/FsyncFile API that by default does the + // re-open+sync+close combo but can (a) be reused easily, and (b) be + // overridden to do that more cleanly, e.g. in EncryptedEnv. + // https://github.com/facebook/rocksdb/issues/13741 std::unique_ptr file_to_sync; Status s = fs_->ReopenWritableFile(path_inside_db, env_options_, &file_to_sync, nullptr); @@ -153,6 +202,10 @@ Status ExternalSstFileIngestionJob::Prepare( ROCKS_LOG_INFO(db_options_.info_log, "Tried to link file %s but it's not supported : %s", path_outside_db.c_str(), status.ToString().c_str()); + } else { + ROCKS_LOG_WARN(db_options_.info_log, "Failed to link file %s to %s: %s", + path_outside_db.c_str(), path_inside_db.c_str(), + status.ToString().c_str()); } } else { f.copy_file = true; @@ -177,6 +230,12 @@ Status ExternalSstFileIngestionJob::Prepare( io_tracer_); // The destination of the copy will be ingested f.file_temperature = dst_temp; + + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, "Failed to copy file %s to %s: %s", + path_outside_db.c_str(), path_inside_db.c_str(), + status.ToString().c_str()); + } } else { // Note: we currently assume that linking files does not cross // temperatures, so no need to change f.file_temperature @@ -227,10 +286,6 @@ Status ExternalSstFileIngestionJob::Prepare( } else { need_generate_file_checksum_ = true; } - FileChecksumGenContext gen_context; - std::unique_ptr file_checksum_gen = - db_options_.file_checksum_gen_factory->CreateFileChecksumGenerator( - gen_context); std::vector generated_checksums; std::vector generated_checksum_func_names; // Step 1: generate the checksum for ingested sst file. @@ -238,11 +293,25 @@ Status ExternalSstFileIngestionJob::Prepare( for (size_t i = 0; i < files_to_ingest_.size(); i++) { std::string generated_checksum; std::string generated_checksum_func_name; - std::string requested_checksum_func_name; + std::string requested_checksum_func_name = + i < files_checksum_func_names.size() ? files_checksum_func_names[i] + : ""; // TODO: rate limit file reads for checksum calculation during file // ingestion. // TODO: plumb Env::IOActivity ReadOptions ro; + // Pass user-provided checksums through FileOptions when available. + // The caller may not have provided checksums at all (empty vectors), + // so we guard with a bounds check. + FileOptions fopts; + if (i < files_checksums.size()) { + fopts.file_checksum = files_checksums[i]; + } + if (i < files_checksum_func_names.size()) { + fopts.file_checksum_func_name = files_checksum_func_names[i]; + } else { + fopts.file_checksum_func_name = kNoFileChecksumFuncName; + } IOStatus io_s = GenerateOneFileChecksum( fs_.get(), files_to_ingest_[i].internal_file_path, db_options_.file_checksum_gen_factory.get(), @@ -251,7 +320,7 @@ Status ExternalSstFileIngestionJob::Prepare( ingestion_options_.verify_checksums_readahead_size, db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get(), ro, db_options_.stats, - db_options_.clock); + db_options_.clock, fopts); if (!io_s.ok()) { status = io_s; ROCKS_LOG_WARN(db_options_.info_log, @@ -281,40 +350,50 @@ Status ExternalSstFileIngestionJob::Prepare( if (files_checksum_func_names[i] != generated_checksum_func_names[i]) { status = Status::InvalidArgument( - "Checksum function name does not match with the checksum " - "function name of this DB"); - ROCKS_LOG_WARN( - db_options_.info_log, - "Sst file checksum verification of file: %s failed: %s", - external_files_paths[i].c_str(), status.ToString().c_str()); + "DB file checksum gen factory " + + std::string(db_options_.file_checksum_gen_factory->Name()) + + " generated checksum function name " + + generated_checksum_func_names[i] + " for file " + + external_files_paths[i] + + " which does not match requested/provided " + + files_checksum_func_names[i]); break; } if (files_checksums[i] != generated_checksums[i]) { status = Status::Corruption( - "Ingested checksum does not match with the generated " - "checksum"); - ROCKS_LOG_WARN( - db_options_.info_log, - "Sst file checksum verification of file: %s failed: %s", - files_to_ingest_[i].internal_file_path.c_str(), - status.ToString().c_str()); + "Checksum verification mismatch for ingestion file " + + external_files_paths[i] + " using function " + + generated_checksum_func_names[i] + ". Expected: " + + Slice(files_checksums[i]).ToString(/*hex=*/true) + + " Computed: " + + Slice(generated_checksums[i]).ToString(/*hex=*/true)); break; } } } else { - // If verify_file_checksum is not enabled, we only verify the - // checksum function name. If it does not match, fail the ingestion. - // If matches, we trust the ingested checksum information and store - // in the Manifest. + // If verify_file_checksum is not enabled, we only verify the factory + // recognizes the checksum function name. If it does not match, fail + // the ingestion. If matches, we trust the ingested checksum + // information and store in the Manifest. for (size_t i = 0; i < files_to_ingest_.size(); i++) { - if (files_checksum_func_names[i] != file_checksum_gen->Name()) { + FileChecksumGenContext gen_context; + gen_context.file_name = files_to_ingest_[i].internal_file_path; + gen_context.requested_checksum_func_name = + files_checksum_func_names[i]; + auto file_checksum_gen = + db_options_.file_checksum_gen_factory + ->CreateFileChecksumGenerator(gen_context); + + if (file_checksum_gen == nullptr || + files_checksum_func_names[i] != file_checksum_gen->Name()) { status = Status::InvalidArgument( - "Checksum function name does not match with the checksum " - "function name of this DB"); - ROCKS_LOG_WARN( - db_options_.info_log, - "Sst file checksum verification of file: %s failed: %s", - external_files_paths[i].c_str(), status.ToString().c_str()); + "Checksum function name " + files_checksum_func_names[i] + + " for file " + external_files_paths[i] + + " not recognized by DB checksum gen factory" + + db_options_.file_checksum_gen_factory->Name() + + (file_checksum_gen ? (" Returned function " + + std::string(file_checksum_gen->Name())) + : "")); break; } files_to_ingest_[i].file_checksum = files_checksums[i]; @@ -329,12 +408,11 @@ Status ExternalSstFileIngestionJob::Prepare( status = Status::InvalidArgument( "The checksum information of ingested sst files are nonempty and " "the size of checksums or the size of the checksum function " - "names " - "does not match with the number of ingested sst files"); - ROCKS_LOG_WARN( - db_options_.info_log, - "The ingested sst files checksum information is incomplete: %s", - status.ToString().c_str()); + "names does not match with the number of ingested sst files"); + } + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, "Ingestion failed: %s", + status.ToString().c_str()); } } } @@ -359,9 +437,9 @@ void ExternalSstFileIngestionJob::DivideInputFilesIntoBatches() { file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true); for (auto& file : files_to_ingest_) { - if (file_range_checker_.OverlapsWithPrev(&file_batches_to_ingest_.back(), - &file, - /* ranges_sorted= */ false)) { + if (!file_batches_to_ingest_.back().unset() && + file_range_checker_.Overlaps(file_batches_to_ingest_.back(), file, + /* known_sorted= */ false)) { file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true); } file_batches_to_ingest_.back().AddFile(&file, file_range_checker_); @@ -370,14 +448,37 @@ void ExternalSstFileIngestionJob::DivideInputFilesIntoBatches() { Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed, SuperVersion* super_version) { - size_t n = files_to_ingest_.size(); - autovector ranges; - ranges.reserve(n); - for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) { - ranges.emplace_back(file_to_ingest.start_ukey, file_to_ingest.limit_ukey); - } - Status status = cfd_->RangesOverlapWithMemtables( - ranges, super_version, db_options_.allow_data_in_errors, flush_needed); + Status status; + if (atomic_replace_range_.has_value() && atomic_replace_range_->unset()) { + // For replacing whole CF, we can simply check whether memtable is empty + *flush_needed = !super_version->mem->IsEmpty(); + } else { + autovector ranges; + if (atomic_replace_range_.has_value()) { + assert(!atomic_replace_range_->smallest_internal_key.unset()); + assert(!atomic_replace_range_->largest_internal_key.unset()); + // NOTE: we already checked in Prepare() that the atomic_replace_range + // covers all the files_to_ingest + // FIXME: need to make upper bound key exclusive (not easy here because + // the existing internal APIs deal in inclusive upper bound user keys) + ranges.emplace_back( + atomic_replace_range_->smallest_internal_key.user_key(), + atomic_replace_range_->largest_internal_key.user_key()); + } else { + ranges.reserve(files_to_ingest_.size()); + for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) { + ranges.emplace_back(file_to_ingest.start_ukey, + file_to_ingest.limit_ukey); + } + } + status = cfd_->RangesOverlapWithMemtables( + ranges, super_version, db_options_.allow_data_in_errors, flush_needed); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to check ranges overlap with memtables: %s", + status.ToString().c_str()); + } + } if (status.ok() && *flush_needed) { if (!ingestion_options_.allow_blocking_flush) { status = Status::InvalidArgument("External file requires flush"); @@ -411,6 +512,9 @@ Status ExternalSstFileIngestionJob::Run() { bool need_flush = false; status = NeedsFlush(&need_flush, super_version); if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to check if flush is needed: %s", + status.ToString().c_str()); return status; } if (need_flush) { @@ -430,15 +534,61 @@ Status ExternalSstFileIngestionJob::Run() { // the only active writer, and hence they are equal SequenceNumber last_seqno = versions_->LastSequence(); edit_.SetColumnFamily(cfd_->GetID()); - // The levels that the files will be ingested into + if (atomic_replace_range_.has_value()) { + auto* vstorage = super_version->current->storage_info(); + if (atomic_replace_range_->unset()) { + if (cfd_->compaction_picker()->IsCompactionInProgress()) { + return Status::InvalidArgument( + "Atomic replace range (full) overlaps with pending compaction"); + } + for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) { + for (auto file : vstorage->LevelFiles(lvl)) { + // Set up to delete file to be replaced + edit_.DeleteFile(lvl, file->fd.GetNumber()); + } + } + } else { + assert(!atomic_replace_range_->smallest_internal_key.unset()); + assert(!atomic_replace_range_->largest_internal_key.unset()); + for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) { + if (cfd_->RangeOverlapWithCompaction( + atomic_replace_range_->smallest_internal_key.user_key(), + atomic_replace_range_->largest_internal_key.user_key(), lvl)) { + return Status::InvalidArgument( + "Atomic replace range overlaps with pending compaction"); + } + for (auto file : vstorage->LevelFiles(lvl)) { + if (file_range_checker_.Overlaps(*atomic_replace_range_, + file->smallest, file->largest)) { + if (file_range_checker_.Contains(*atomic_replace_range_, + file->smallest, file->largest)) { + // Set up to delete file to be replaced + edit_.DeleteFile(lvl, file->fd.GetNumber()); + } else { + // TODO: generate and ingest a tombstone file also + return Status::InvalidArgument( + "Atomic replace range partially overlaps with existing file"); + } + } + } + } + } + } + + // Find levels to ingest into std::optional prev_batch_uppermost_level; + // batches at the front of file_batches_to_ingest_ contains older updates and + // are placed in smaller levels. for (auto& batch : file_batches_to_ingest_) { int batch_uppermost_level = 0; status = AssignLevelsForOneBatch(batch, super_version, force_global_seqno, &last_seqno, &batch_uppermost_level, prev_batch_uppermost_level); if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to assign levels for one batch: %s", + status.ToString().c_str()); return status; } @@ -481,8 +631,19 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch( &largest_parsed, false /* log_err_key */); } if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, "Failed to parse internal key: %s", + status.ToString().c_str()); return status; } + + // If any ingested file overlaps with the DB, it will fail here. + if (ingestion_options_.allow_db_generated_files && assigned_seqno != 0) { + return Status::InvalidArgument( + "An ingested file overlaps with existing data in the DB and has been " + "assigned a non-zero sequence number, which is not allowed when " + "'allow_db_generated_files' is enabled."); + } + if (smallest_parsed.sequence == 0 && assigned_seqno != 0) { UpdateInternalKey(file->smallest_internal_key.rep(), assigned_seqno, smallest_parsed.type); @@ -494,6 +655,10 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch( status = AssignGlobalSeqnoForIngestedFile(file, assigned_seqno); if (!status.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "Failed to assign global sequence number for ingested file: %s", + status.ToString().c_str()); return status; } TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run", @@ -501,11 +666,14 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch( assert(assigned_seqno == 0 || assigned_seqno == *last_seqno + 1); if (assigned_seqno > *last_seqno) { *last_seqno = assigned_seqno; - ++consumed_seqno_count_; } + max_assigned_seqno_ = std::max(max_assigned_seqno_, assigned_seqno); status = GenerateChecksumForIngestedFile(file); if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to generate checksum for ingested file: %s", + status.ToString().c_str()); return status; } @@ -518,34 +686,39 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch( current_time = oldest_ancester_time = static_cast(temp_current_time); } - uint64_t tail_size = 0; - bool contain_no_data_blocks = file->table_properties.num_entries > 0 && - (file->table_properties.num_entries == - file->table_properties.num_range_deletions); - if (file->table_properties.tail_start_offset > 0 || - contain_no_data_blocks) { - uint64_t file_size = file->fd.GetFileSize(); - assert(file->table_properties.tail_start_offset <= file_size); - tail_size = file_size - file->table_properties.tail_start_offset; - } + uint64_t tail_size = FileMetaData::CalculateTailSize( + file->fd.GetFileSize(), file->table_properties); bool marked_for_compaction = file->table_properties.num_range_deletions == 1 && (file->table_properties.num_entries == file->table_properties.num_range_deletions); + SequenceNumber smallest_seqno = file->assigned_seqno; + SequenceNumber largest_seqno = file->assigned_seqno; + if (ingestion_options_.allow_db_generated_files) { + assert(file->assigned_seqno == 0); + assert(file->smallest_seqno != kMaxSequenceNumber); + assert(file->largest_seqno != kMaxSequenceNumber); + smallest_seqno = file->smallest_seqno; + largest_seqno = file->largest_seqno; + max_assigned_seqno_ = std::max(max_assigned_seqno_, file->largest_seqno); + } FileMetaData f_metadata( file->fd.GetNumber(), file->fd.GetPathId(), file->fd.GetFileSize(), - file->smallest_internal_key, file->largest_internal_key, - file->assigned_seqno, file->assigned_seqno, false, - file->file_temperature, kInvalidBlobFileNumber, oldest_ancester_time, - current_time, + file->smallest_internal_key, file->largest_internal_key, smallest_seqno, + largest_seqno, false, file->file_temperature, kInvalidBlobFileNumber, + oldest_ancester_time, current_time, ingestion_options_.ingest_behind ? kReservedEpochNumberForFileIngestedBehind - : cfd_->NewEpochNumber(), + : cfd_->NewEpochNumber(), // orders files ingested to L0 file->file_checksum, file->file_checksum_func_name, file->unique_id, 0, - tail_size, file->user_defined_timestamps_persisted); + tail_size, file->user_defined_timestamps_persisted, "", ""); f_metadata.temperature = file->file_temperature; f_metadata.marked_for_compaction = marked_for_compaction; + // Extract min/max timestamps from table properties for UDT support. + // This ensures ingested files have proper timestamp ranges in FileMetaData, + // similar to files created by flush and compaction. + ExtractTimestampFromTableProperties(file->table_properties, &f_metadata); edit_.AddFile(file->picked_level, f_metadata); *batch_uppermost_level = @@ -593,15 +766,13 @@ void ExternalSstFileIngestionJob::CreateEquivalentFileIngestingCompactions() { cfd_->ioptions().compaction_style), LLONG_MAX /* max compaction bytes, not applicable */, 0 /* output path ID, not applicable */, mutable_cf_options.compression, - mutable_cf_options.compression_opts, - mutable_cf_options.default_write_temperature, + mutable_cf_options.compression_opts, Temperature::kUnknown, 0 /* max_subcompaction, not applicable */, {} /* grandparents, not applicable */, std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */, - false /* is manual */, "" /* trim_ts */, -1 /* score, not applicable */, - false /* is deletion compaction, not applicable */, - files_overlap_ /* l0_files_might_overlap, not applicable */, - CompactionReason::kExternalSstIngestion)); + CompactionReason::kExternalSstIngestion, "" /* trim_ts */, + -1 /* score, not applicable */, + files_overlap_ /* l0_files_might_overlap, not applicable */)); } } @@ -689,7 +860,6 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) { // We failed to add the files to the database // remove all the files we copied DeleteInternalFiles(); - consumed_seqno_count_ = 0; files_overlap_ = false; } else if (status.ok() && ingestion_options_.move_files) { // The files were moved and added successfully, remove original file links @@ -732,6 +902,10 @@ Status ExternalSstFileIngestionJob::ResetTableReader( Status status = fs_->NewRandomAccessFile(external_file, fo, &sst_file, nullptr); if (!status.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "Failed to create random access file for external file %s: %s", + external_file.c_str(), status.ToString().c_str()); return status; } Temperature updated_temp = sst_file->GetTemperature(); @@ -750,7 +924,8 @@ Status ExternalSstFileIngestionJob::ResetTableReader( ro, TableReaderOptions( cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, - env_options_, cfd_->internal_comparator(), + sv->mutable_cf_options.compression_manager.get(), env_options_, + cfd_->internal_comparator(), sv->mutable_cf_options.block_protection_bytes_per_key, /*skip_filters*/ false, /*immortal*/ false, /*force_direct_prefetch*/ false, /*level*/ -1, @@ -853,6 +1028,10 @@ Status ExternalSstFileIngestionJob::SanityCheckTableProperties( // user_defined_timestamps_persisted flag for the file. file_to_ingest->user_defined_timestamps_persisted = false; } else if (!s.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "ValidateUserDefinedTimestampsOptions failed for external file %s: %s", + external_file.c_str(), s.ToString().c_str()); return s; } @@ -877,6 +1056,9 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( Status status = fs_->GetFileSize(external_file, IOOptions(), &file_to_ingest->file_size, nullptr); if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get file size for external file %s: %s", + external_file.c_str(), status.ToString().c_str()); return status; } @@ -893,15 +1075,52 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( /*user_defined_timestamps_persisted=*/true, sv, file_to_ingest, &table_reader); if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to reset table reader for external file %s: %s", + external_file.c_str(), status.ToString().c_str()); return status; } status = SanityCheckTableProperties(external_file, new_file_number, sv, file_to_ingest, &table_reader); if (!status.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "Failed to sanity check table properties for external file %s: %s", + external_file.c_str(), status.ToString().c_str()); return status; } + const bool allow_data_in_errors = db_options_.allow_data_in_errors; + ParsedInternalKey key; + if (ingestion_options_.allow_db_generated_files) { + // We are ingesting a DB generated SST file for which we don't reassign + // sequence numbers. We need its smallest sequence number and largest + // sequence number for FileMetaData. + Status seqno_status = GetSeqnoBoundaryForFile( + table_reader.get(), sv, file_to_ingest, allow_data_in_errors); + + if (!seqno_status.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "Failed to get sequence number boundary for external file %s: %s", + external_file.c_str(), seqno_status.ToString().c_str()); + return seqno_status; + } + assert(file_to_ingest->smallest_seqno <= file_to_ingest->largest_seqno); + assert(file_to_ingest->largest_seqno < kMaxSequenceNumber); + } else { + SequenceNumber largest_seqno = + table_reader.get()->GetTableProperties()->key_largest_seqno; + // UINT64_MAX means unknown and the file is generated before table property + // `key_largest_seqno` is introduced. + if (largest_seqno != UINT64_MAX && largest_seqno > 0) { + return Status::Corruption( + "External file has non zero largest sequence number " + + std::to_string(largest_seqno)); + } + } + if (ingestion_options_.verify_checksums_before_ingest) { // If customized readahead size is needed, we can pass a user option // all the way to here. Right now we just rely on the default readahead @@ -913,11 +1132,13 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( status = table_reader->VerifyChecksum( ro, TableReaderCaller::kExternalSSTIngestion); if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to verify checksum for table reader: %s", + status.ToString().c_str()); return status; } } - ParsedInternalKey key; // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; ro.fill_cache = ingestion_options_.fill_cache; @@ -926,7 +1147,6 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); // Get first (smallest) and last (largest) key from file. - bool allow_data_in_errors = db_options_.allow_data_in_errors; iter->SeekToFirst(); if (iter->Valid()) { Status pik_status = @@ -935,7 +1155,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( return Status::Corruption("Corrupted key in external file. ", pik_status.getState()); } - if (key.sequence != 0) { + if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) { return Status::Corruption("External file has non zero sequence number"); } file_to_ingest->smallest_internal_key.SetFrom(key); @@ -972,41 +1192,13 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( return Status::Corruption("Corrupted key in external file. ", pik_status.getState()); } - if (key.sequence != 0) { + if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) { return Status::Corruption("External file has non zero sequence number"); } file_to_ingest->largest_internal_key.SetFrom(key); } else if (!iter->status().ok()) { return iter->status(); } - SequenceNumber largest_seqno = - table_reader.get()->GetTableProperties()->key_largest_seqno; - // UINT64_MAX means unknown and the file is generated before table property - // `key_largest_seqno` is introduced. - if (largest_seqno != UINT64_MAX && largest_seqno > 0) { - return Status::Corruption( - "External file has non zero largest sequence number " + - std::to_string(largest_seqno)); - } - if (ingestion_options_.allow_db_generated_files && - largest_seqno == UINT64_MAX) { - // Need to verify that all keys have seqno zero. - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - Status pik_status = - ParseInternalKey(iter->key(), &key, allow_data_in_errors); - if (!pik_status.ok()) { - return Status::Corruption("Corrupted key in external file. ", - pik_status.getState()); - } - if (key.sequence != 0) { - return Status::NotSupported( - "External file has a key with non zero sequence number."); - } - } - if (!iter->status().ok()) { - return iter->status(); - } - } std::unique_ptr range_del_iter( table_reader->NewRangeTombstoneIterator(ro)); @@ -1021,7 +1213,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( return Status::Corruption("Corrupted key in external file. ", pik_status.getState()); } - if (key.sequence != 0) { + if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) { return Status::Corruption( "External file has a range deletion with non zero sequence " "number."); @@ -1069,12 +1261,14 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( const size_t ts_sz = ucmp_->timestamp_size(); assert(!prev_batch_uppermost_level.has_value() || prev_batch_uppermost_level.value() < cfd_->NumberLevels()); - bool must_assign_to_l0 = prev_batch_uppermost_level.has_value() && - prev_batch_uppermost_level.value() == 0; - if (force_global_seqno || files_overlap_ || - compaction_style == kCompactionStyleFIFO || must_assign_to_l0) { + bool must_assign_to_l0 = (prev_batch_uppermost_level.has_value() && + prev_batch_uppermost_level.value() == 0) || + compaction_style == kCompactionStyleFIFO; + + if (force_global_seqno || (!ingestion_options_.allow_db_generated_files && + (files_overlap_ || must_assign_to_l0))) { *assigned_seqno = last_seqno + 1; - if (compaction_style == kCompactionStyleFIFO || must_assign_to_l0) { + if (must_assign_to_l0) { assert(ts_sz == 0); file_to_ingest->picked_level = 0; if (ingestion_options_.fail_if_not_bottommost_level && @@ -1095,15 +1289,29 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( ro.total_order_seek = true; int target_level = 0; auto* vstorage = cfd_->current()->storage_info(); - assert(!must_assign_to_l0); - int exclusive_end_level = prev_batch_uppermost_level.has_value() - ? prev_batch_uppermost_level.value() - : cfd_->NumberLevels(); + assert(!must_assign_to_l0 || ingestion_options_.allow_db_generated_files); + int assigned_level_exclusive_end = cfd_->NumberLevels(); + if (must_assign_to_l0) { + assigned_level_exclusive_end = 0; + } else if (prev_batch_uppermost_level.has_value()) { + assigned_level_exclusive_end = prev_batch_uppermost_level.value(); + } - for (int lvl = 0; lvl < exclusive_end_level; lvl++) { + // When ingesting db generated files, we require that ingested files do not + // overlap with any file in the DB. So we need to check all levels. + int overlap_checking_exclusive_end = + ingestion_options_.allow_db_generated_files + ? cfd_->NumberLevels() + : assigned_level_exclusive_end; + for (int lvl = 0; lvl < overlap_checking_exclusive_end; lvl++) { if (lvl > 0 && lvl < vstorage->base_level()) { continue; } + if (lvl < assigned_level_exclusive_end && + atomic_replace_range_.has_value()) { + target_level = lvl; + continue; + } if (cfd_->RangeOverlapWithCompaction(file_to_ingest->start_ukey, file_to_ingest->limit_ukey, lvl)) { // We must use L0 or any level higher than `lvl` to be able to overwrite @@ -1118,6 +1326,9 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( ro, env_options_, file_to_ingest->start_ukey, file_to_ingest->limit_ukey, lvl, &overlap_with_level); if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to check overlap with level iterator: %s", + status.ToString().c_str()); return status; } if (overlap_with_level) { @@ -1131,7 +1342,8 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( // We don't overlap with any keys in this level, but we still need to check // if our file can fit in it - if (IngestedFileFitInLevel(file_to_ingest, lvl)) { + if (lvl < assigned_level_exclusive_end && + IngestedFileFitInLevel(file_to_ingest, lvl)) { target_level = lvl; } } @@ -1140,8 +1352,9 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( target_level < cfd_->NumberLevels() - 1) { status = Status::TryAgain( "Files cannot be ingested to Lmax. Please make sure key range of Lmax " - "and ongoing compaction's output to Lmax" - "does not overlap with files to ingest."); + "and ongoing compaction's output to Lmax does not overlap with files " + "to ingest. Input files overlapping with each other can cause some " + "file to be assigned to non Lmax level."); return status; } @@ -1162,16 +1375,13 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( } } - if (ingestion_options_.allow_db_generated_files && *assigned_seqno != 0) { - return Status::InvalidArgument( - "An ingested file is assigned to a non-zero sequence number, which is " - "incompatible with ingestion option allow_db_generated_files."); - } return status; } Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile( IngestedFileInfo* file_to_ingest) { + assert(!atomic_replace_range_.has_value()); + auto* vstorage = cfd_->current()->storage_info(); // First, check if new files fit in the last level int last_lvl = cfd_->NumberLevels() - 1; @@ -1181,13 +1391,13 @@ Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile( "at the last level!"); } - // Second, check if despite allow_ingest_behind=true we still have 0 seqnums - // at some upper level + // Second, check if despite cf_allow_ingest_behind=true we still have 0 + // seqnums at some upper level for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) { for (auto file : vstorage->LevelFiles(lvl)) { if (file->fd.smallest_seqno == 0) { return Status::InvalidArgument( - "Can't ingest_behind file as despite allow_ingest_behind=true " + "Can't ingest_behind file as despite cf_allow_ingest_behind=true " "there are files with 0 seqno in database at upper levels!"); } } @@ -1199,8 +1409,12 @@ Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile( Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile( IngestedFileInfo* file_to_ingest, SequenceNumber seqno) { + if (ingestion_options_.allow_db_generated_files) { + assert(seqno == 0); + assert(file_to_ingest->original_seqno == 0); + } if (file_to_ingest->original_seqno == seqno) { - // This file already have the correct global seqno + // This file already has the correct global seqno. return Status::OK(); } else if (!ingestion_options_.allow_global_seqno) { return Status::InvalidArgument("Global seqno is required, but disabled"); @@ -1227,6 +1441,14 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile( PutFixed64(&seqno_val, seqno); status = fsptr->Write(file_to_ingest->global_seqno_offset, seqno_val, IOOptions(), nullptr); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to write global seqno to %s: %s", + file_to_ingest->internal_file_path.c_str(), + status.ToString().c_str()); + return status; + } + if (status.ok()) { TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno"); status = SyncIngestedFile(fsptr.get()); @@ -1243,6 +1465,11 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile( return status; } } else if (!status.IsNotSupported()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "Failed to open ingested file %s for random read/write: %s", + file_to_ingest->internal_file_path.c_str(), + status.ToString().c_str()); return status; } } @@ -1267,14 +1494,19 @@ IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile( // TODO: rate limit file reads for checksum calculation during file ingestion. // TODO: plumb Env::IOActivity ReadOptions ro; + FileOptions gen_fopts; + gen_fopts.file_checksum_func_name = kNoFileChecksumFuncName; IOStatus io_s = GenerateOneFileChecksum( fs_.get(), file_to_ingest->internal_file_path, db_options_.file_checksum_gen_factory.get(), requested_checksum_func_name, &file_checksum, &file_checksum_func_name, ingestion_options_.verify_checksums_readahead_size, db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get(), - ro, db_options_.stats, db_options_.clock); + ro, db_options_.stats, db_options_.clock, gen_fopts); if (!io_s.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, "Failed to generate checksum for %s: %s", + file_to_ingest->internal_file_path.c_str(), io_s.ToString().c_str()); return io_s; } file_to_ingest->file_checksum = std::move(file_checksum); @@ -1314,4 +1546,91 @@ Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) { } } +Status ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile( + TableReader* table_reader, SuperVersion* sv, + IngestedFileInfo* file_to_ingest, bool allow_data_in_errors) { + const auto tp = table_reader->GetTableProperties(); + const bool has_largest_seqno = tp->HasKeyLargestSeqno(); + SequenceNumber largest_seqno = tp->key_largest_seqno; + if (has_largest_seqno) { + file_to_ingest->largest_seqno = largest_seqno; + if (largest_seqno == 0) { + file_to_ingest->smallest_seqno = 0; + return Status::OK(); + } + if (tp->HasKeySmallestSeqno()) { + file_to_ingest->smallest_seqno = tp->key_smallest_seqno; + return Status::OK(); + } + } + + // For older SST files they may not be recorded in table properties, so + // we scan the file to find out. + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile:FileScan"); + SequenceNumber smallest_seqno = kMaxSequenceNumber; + SequenceNumber largest_seqno_from_iter = 0; + ReadOptions ro; + ro.fill_cache = ingestion_options_.fill_cache; + std::unique_ptr iter(table_reader->NewIterator( + ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); + ParsedInternalKey key; + iter->SeekToFirst(); + while (iter->Valid()) { + Status pik_status = + ParseInternalKey(iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); + } + smallest_seqno = std::min(smallest_seqno, key.sequence); + largest_seqno_from_iter = std::max(largest_seqno_from_iter, key.sequence); + iter->Next(); + } + if (!iter->status().ok()) { + return iter->status(); + } + + if (table_reader->GetTableProperties()->num_range_deletions > 0) { + std::unique_ptr range_del_iter( + table_reader->NewRangeTombstoneIterator(ro)); + if (range_del_iter != nullptr) { + for (range_del_iter->SeekToFirst(); range_del_iter->Valid(); + range_del_iter->Next()) { + Status pik_status = + ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); + } + smallest_seqno = std::min(smallest_seqno, key.sequence); + largest_seqno_from_iter = + std::max(largest_seqno_from_iter, key.sequence); + } + if (!range_del_iter->status().ok()) { + return range_del_iter->status(); + } + } + } + + file_to_ingest->smallest_seqno = smallest_seqno; + if (!has_largest_seqno) { + file_to_ingest->largest_seqno = largest_seqno_from_iter; + } else { + assert(largest_seqno == largest_seqno_from_iter); + file_to_ingest->largest_seqno = largest_seqno; + } + + if (file_to_ingest->largest_seqno == kMaxSequenceNumber) { + return Status::InvalidArgument( + "Unknown smallest seqno for db generated file."); + } + if (file_to_ingest->smallest_seqno == kMaxSequenceNumber) { + return Status::InvalidArgument( + "Unknown largest seqno for db generated file."); + } + return Status::OK(); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h index 4a853afed971..d9ecf43da1b4 100644 --- a/db/external_sst_file_ingestion_job.h +++ b/db/external_sst_file_ingestion_job.h @@ -27,50 +27,77 @@ class SystemClock; struct KeyRangeInfo { // Smallest internal key in an external file or for a batch of external files. + // unset() could be either invalid or "before all keys" InternalKey smallest_internal_key; // Largest internal key in an external file or for a batch of external files. + // unset() could be either invalid or "after all keys" InternalKey largest_internal_key; - bool empty() const { - return smallest_internal_key.size() == 0 && - largest_internal_key.size() == 0; + bool unset() const { + // Legal internal keys are at least 8 bytes. + return smallest_internal_key.unset() || largest_internal_key.unset(); } }; // Helper class to apply SST file key range checks to the external files. +// XXX: using sstableKeyCompare with user comparator on internal keys is +// very broken class ExternalFileRangeChecker { public: explicit ExternalFileRangeChecker(const Comparator* ucmp) : ucmp_(ucmp) {} // Operator used for sorting ranges. - bool operator()(const KeyRangeInfo* prev_range, - const KeyRangeInfo* range) const { - assert(prev_range); - assert(range); - return sstableKeyCompare(ucmp_, prev_range->smallest_internal_key, - range->smallest_internal_key) < 0; + bool operator()(const KeyRangeInfo* range1, + const KeyRangeInfo* range2) const { + assert(range1); + assert(range2); + assert(!range1->unset()); + assert(!range2->unset()); + return sstableKeyCompare(ucmp_, range1->smallest_internal_key, + range2->smallest_internal_key) < 0; } - // Check whether `range` overlaps with `prev_range`. `ranges_sorted` can be - // set to true when the inputs are already sorted based on the sorting logic - // provided by this checker's operator(), which can help simplify the check. - bool OverlapsWithPrev(const KeyRangeInfo* prev_range, - const KeyRangeInfo* range, - bool ranges_sorted = false) const { - assert(prev_range); - assert(range); - if (prev_range->empty() || range->empty()) { + bool Overlaps(const KeyRangeInfo& range1, const KeyRangeInfo& range2, + bool known_sorted = false) const { + return Overlaps(range1, range2.smallest_internal_key, + range2.largest_internal_key, known_sorted); + } + bool Overlaps(const KeyRangeInfo& range1, const InternalKey& range2_smallest, + const InternalKey& range2_largest, + bool known_sorted = false) const { + bool any_unset = + range1.unset() || range2_smallest.unset() || range2_largest.unset(); + if (any_unset) { + assert(!any_unset); return false; } - if (ranges_sorted) { - return sstableKeyCompare(ucmp_, prev_range->largest_internal_key, - range->smallest_internal_key) >= 0; + if (known_sorted) { + return sstableKeyCompare(ucmp_, range1.largest_internal_key, + range2_smallest) >= 0; } - return sstableKeyCompare(ucmp_, prev_range->largest_internal_key, - range->smallest_internal_key) >= 0 && - sstableKeyCompare(ucmp_, prev_range->smallest_internal_key, - range->largest_internal_key) <= 0; + return sstableKeyCompare(ucmp_, range1.largest_internal_key, + range2_smallest) >= 0 && + sstableKeyCompare(ucmp_, range1.smallest_internal_key, + range2_largest) <= 0; + } + + bool Contains(const KeyRangeInfo& range1, const KeyRangeInfo& range2) { + return Contains(range1, range2.smallest_internal_key, + range2.largest_internal_key); + } + bool Contains(const KeyRangeInfo& range1, const InternalKey& range2_smallest, + const InternalKey& range2_largest) { + bool any_unset = + range1.unset() || range2_smallest.unset() || range2_largest.unset(); + if (any_unset) { + assert(!any_unset); + return false; + } + return sstableKeyCompare(ucmp_, range1.smallest_internal_key, + range2_smallest) <= 0 && + sstableKeyCompare(ucmp_, range1.largest_internal_key, + range2_largest) >= 0; } void MaybeUpdateRange(const InternalKey& start_key, @@ -153,6 +180,9 @@ struct IngestedFileInfo : public KeyRangeInfo { // the user key's format in the external file matches the column family's // setting. bool user_defined_timestamps_persisted = true; + + SequenceNumber largest_seqno = kMaxSequenceNumber; + SequenceNumber smallest_seqno = kMaxSequenceNumber; }; // A batch of files. @@ -203,7 +233,7 @@ class ExternalSstFileIngestionJob { directories_(directories), event_logger_(event_logger), job_start_time_(clock_->NowMicros()), - consumed_seqno_count_(0), + max_assigned_seqno_(0), io_tracer_(io_tracer) { assert(directories != nullptr); assert(cfd_); @@ -218,6 +248,7 @@ class ExternalSstFileIngestionJob { Status Prepare(const std::vector& external_files_paths, const std::vector& files_checksums, const std::vector& files_checksum_func_names, + const std::optional& atomic_replace_range, const Temperature& file_temperature, uint64_t next_file_number, SuperVersion* sv); @@ -259,8 +290,16 @@ class ExternalSstFileIngestionJob { return files_to_ingest_; } - // How many sequence numbers did we consume as part of the ingestion job? - int ConsumedSequenceNumbersCount() const { return consumed_seqno_count_; } + // Return the maximum assigned sequence number for all files in this job. + // When allow_db_generated_files = false, we may assign global sequence + // numbers to ingested files. The global sequence numbers are sequence numbers + // following versions_->LastSequence(). + // When allow_db_generated_files = true, we ingest files that already have + // sequence numbers assigned. max_assigned_seqno_ will be the max sequence + // number among ingested files. + SequenceNumber MaxAssignedSequenceNumber() const { + return max_assigned_seqno_; + } private: Status ResetTableReader(const std::string& external_file, @@ -321,7 +360,7 @@ class ExternalSstFileIngestionJob { std::optional prev_batch_uppermost_level); // File that we want to ingest behind always goes to the lowest level; - // we just check that it fits in the level, that DB allows ingest_behind, + // we just check that it fits in the level, that the CF allows ingest_behind, // and that we don't have 0 seqnums at the upper levels. // REQUIRES: Mutex held Status CheckLevelForIngestedBehindFile(IngestedFileInfo* file_to_ingest); @@ -341,6 +380,13 @@ class ExternalSstFileIngestionJob { template Status SyncIngestedFile(TWritableFile* file); + // Helper function to obtain the smallest and largest sequence number from a + // file. When OK is returned, file_to_ingest->smallest_seqno and + // file_to_ingest->largest_seqno will be updated. + Status GetSeqnoBoundaryForFile(TableReader* table_reader, SuperVersion* sv, + IngestedFileInfo* file_to_ingest, + bool allow_data_in_errors); + // Create equivalent `Compaction` objects to this file ingestion job // , which will be used to check range conflict with other ongoing // compactions. @@ -362,11 +408,12 @@ class ExternalSstFileIngestionJob { autovector files_to_ingest_; std::vector file_batches_to_ingest_; const IngestExternalFileOptions& ingestion_options_; + std::optional atomic_replace_range_; Directories* directories_; EventLogger* event_logger_; VersionEdit edit_; uint64_t job_start_time_; - int consumed_seqno_count_; + SequenceNumber max_assigned_seqno_; // Set in ExternalSstFileIngestionJob::Prepare(), if true all files are // ingested in L0 bool files_overlap_{false}; diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index de261af7a01b..c4cc09797af2 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -7,6 +7,7 @@ #include #include +#include #include "db/db_test_util.h" #include "db/dbformat.h" @@ -79,8 +80,7 @@ class ExternSSTFileLinkFailFallbackTest } void TearDown() override { - delete db_; - db_ = nullptr; + db_.reset(); ASSERT_OK(DestroyDB(dbname_, options_)); } @@ -2417,102 +2417,130 @@ TEST_F(ExternalSSTFileTest, SnapshotInconsistencyBug) { } TEST_P(ExternalSSTFileTest, IngestBehind) { - Options options = CurrentOptions(); - options.compaction_style = kCompactionStyleUniversal; - options.num_levels = 3; - options.disable_auto_compactions = false; - DestroyAndReopen(options); - std::vector> file_data; - std::map true_data; + for (bool cf_option : {false, true}) { + SCOPED_TRACE("cf_option = " + std::to_string(cf_option)); + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 3; + options.disable_auto_compactions = false; + DestroyAndReopen(options); + std::vector> file_data; + std::map true_data; - // Insert 100 -> 200 into the memtable - for (int i = 100; i <= 200; i++) { - ASSERT_OK(Put(Key(i), "memtable")); - } + // Insert 100 -> 200 into the memtable + for (int i = 100; i <= 200; i++) { + ASSERT_OK(Put(Key(i), "memtable")); + } - // Insert 100 -> 200 using IngestExternalFile - file_data.clear(); - for (int i = 0; i <= 20; i++) { - file_data.emplace_back(Key(i), "ingest_behind"); - true_data[Key(i)] = "ingest_behind"; - } + // Insert 100 -> 200 using IngestExternalFile + file_data.clear(); + for (int i = 0; i <= 20; i++) { + file_data.emplace_back(Key(i), "ingest_behind"); + true_data[Key(i)] = "ingest_behind"; + } - bool allow_global_seqno = true; - bool ingest_behind = true; - bool write_global_seqno = std::get<0>(GetParam()); - bool verify_checksums_before_ingest = std::get<1>(GetParam()); + bool allow_global_seqno = true; + bool ingest_behind = true; + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); - // Can't ingest behind since allow_ingest_behind isn't set to true - ASSERT_NOK(GenerateAndAddExternalFile( - options, file_data, -1, allow_global_seqno, write_global_seqno, - verify_checksums_before_ingest, ingest_behind, false /*sort_data*/, - &true_data)); + // Can't ingest behind since allow_ingest_behind isn't set to true + ASSERT_NOK(GenerateAndAddExternalFile( + options, file_data, -1, allow_global_seqno, write_global_seqno, + verify_checksums_before_ingest, ingest_behind, false /*sort_data*/, + &true_data)); - options.allow_ingest_behind = true; - // check that we still can open the DB, as num_levels should be - // sanitized to 3 - options.num_levels = 2; - DestroyAndReopen(options); + if (cf_option) { + options.cf_allow_ingest_behind = true; + } else { + options.allow_ingest_behind = true; + } + // check that we still can open the DB, as num_levels should be + // sanitized to 3 + options.num_levels = 2; + DestroyAndReopen(options); - options.num_levels = 3; - DestroyAndReopen(options); - true_data.clear(); - // Insert 100 -> 200 into the memtable - for (int i = 100; i <= 200; i++) { - ASSERT_OK(Put(Key(i), "memtable")); - true_data[Key(i)] = "memtable"; - } - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - // Universal picker should go at second from the bottom level - ASSERT_EQ("0,1", FilesPerLevel()); - ASSERT_OK(GenerateAndAddExternalFile( - options, file_data, -1, allow_global_seqno, write_global_seqno, - verify_checksums_before_ingest, true /*ingest_behind*/, - false /*sort_data*/, &true_data)); - ASSERT_EQ("0,1,1", FilesPerLevel()); - // this time ingest should fail as the file doesn't fit to the bottom level - ASSERT_NOK(GenerateAndAddExternalFile( - options, file_data, -1, allow_global_seqno, write_global_seqno, - verify_checksums_before_ingest, true /*ingest_behind*/, - false /*sort_data*/, &true_data)); - ASSERT_EQ("0,1,1", FilesPerLevel()); - std::vector> level_to_files; - dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files); - uint64_t ingested_file_number = level_to_files[2][0].fd.GetNumber(); - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - // Last level should not be compacted - ASSERT_EQ("0,1,1", FilesPerLevel()); - dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files); - ASSERT_EQ(ingested_file_number, level_to_files[2][0].fd.GetNumber()); - size_t kcnt = 0; - VerifyDBFromMap(true_data, &kcnt, false); + options.num_levels = 3; + DestroyAndReopen(options); + true_data.clear(); + // Insert 100 -> 200 into the memtable + for (int i = 100; i <= 200; i++) { + ASSERT_OK(Put(Key(i), "memtable")); + true_data[Key(i)] = "memtable"; + } + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + // Universal picker should go at second from the bottom level + ASSERT_EQ("0,1", FilesPerLevel()); + ASSERT_OK(GenerateAndAddExternalFile( + options, file_data, -1, allow_global_seqno, write_global_seqno, + verify_checksums_before_ingest, true /*ingest_behind*/, + false /*sort_data*/, &true_data)); + ASSERT_EQ("0,1,1", FilesPerLevel()); + // this time ingest should fail as the file doesn't fit to the bottom level + ASSERT_NOK(GenerateAndAddExternalFile( + options, file_data, -1, allow_global_seqno, write_global_seqno, + verify_checksums_before_ingest, true /*ingest_behind*/, + false /*sort_data*/, &true_data)); + ASSERT_EQ("0,1,1", FilesPerLevel()); + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), + &level_to_files); + uint64_t ingested_file_number = level_to_files[2][0].fd.GetNumber(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + // Last level should not be compacted + ASSERT_EQ("0,1,1", FilesPerLevel()); + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), + &level_to_files); + ASSERT_EQ(ingested_file_number, level_to_files[2][0].fd.GetNumber()); + size_t kcnt = 0; + VerifyDBFromMap(true_data, &kcnt, false); - // Auto-compaction should not include the last level. - // Trigger compaction if size amplification exceeds 110%. - options.compaction_options_universal.max_size_amplification_percent = 110; - options.level0_file_num_compaction_trigger = 4; - ASSERT_OK(TryReopen(options)); - Random rnd(301); - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 10; j++) { - true_data[Key(j)] = rnd.RandomString(1000); - ASSERT_OK(Put(Key(j), true_data[Key(j)])); + // Auto-compaction should not include the last level. + // Trigger compaction if size amplification exceeds 110%. + options.compaction_options_universal.max_size_amplification_percent = 110; + options.level0_file_num_compaction_trigger = 4; + ASSERT_OK(TryReopen(options)); + Random rnd(301); + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 10; j++) { + true_data[Key(j)] = rnd.RandomString(1000); + ASSERT_OK(Put(Key(j), true_data[Key(j)])); + } + ASSERT_OK(Flush()); } - ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), + &level_to_files); + ASSERT_EQ(1, level_to_files[2].size()); + ASSERT_EQ(ingested_file_number, level_to_files[2][0].fd.GetNumber()); + + // Turning off the option allows DB to compact ingested files. + if (cf_option) { + // Test that another CF does not allow ingest behind + ColumnFamilyHandle* new_cfh; + Options new_cf_option; + ASSERT_OK(db_->CreateColumnFamily(new_cf_option, "new_cf", &new_cfh)); + ASSERT_TRUE(GenerateAndAddExternalFile( + new_cf_option, file_data, -1, allow_global_seqno, + write_global_seqno, verify_checksums_before_ingest, + true /*ingest_behind*/, false /*sort_data*/, nullptr, + /*cfh=*/new_cfh) + .IsInvalidArgument()); + ASSERT_OK(db_->DropColumnFamily(new_cfh)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(new_cfh)); + + options.cf_allow_ingest_behind = false; + } else { + options.allow_ingest_behind = false; + } + ASSERT_OK(TryReopen(options)); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), + &level_to_files); + ASSERT_EQ(1, level_to_files[2].size()); + ASSERT_NE(ingested_file_number, level_to_files[2][0].fd.GetNumber()); + VerifyDBFromMap(true_data, &kcnt, false); } - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files); - ASSERT_EQ(1, level_to_files[2].size()); - ASSERT_EQ(ingested_file_number, level_to_files[2][0].fd.GetNumber()); - - // Turning off the option allows DB to compact ingested files. - options.allow_ingest_behind = false; - ASSERT_OK(TryReopen(options)); - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files); - ASSERT_EQ(1, level_to_files[2].size()); - ASSERT_NE(ingested_file_number, level_to_files[2][0].fd.GetNumber()); - VerifyDBFromMap(true_data, &kcnt, false); } TEST_F(ExternalSSTFileTest, SkipBloomFilter) { @@ -2541,14 +2569,19 @@ TEST_F(ExternalSSTFileTest, SkipBloomFilter) { options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 1); } - // Create external SST file but skip bloom filters + // Create external SST file but skip bloom filters by using options + // with no filter policy options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); DestroyAndReopen(options); { std::string file_path = sst_files_dir_ + "sst_with_no_bloom.sst"; - SstFileWriter sst_file_writer(EnvOptions(), options, nullptr, true, - Env::IOPriority::IO_TOTAL, - true /* skip_filters */); + // Use options with no filter policy to skip bloom filters + Options no_filter_options = options; + BlockBasedTableOptions no_filter_table_options = table_options; + no_filter_table_options.filter_policy.reset(); + no_filter_options.table_factory.reset( + NewBlockBasedTableFactory(no_filter_table_options)); + SstFileWriter sst_file_writer(EnvOptions(), no_filter_options); ASSERT_OK(sst_file_writer.Open(file_path)); ASSERT_OK(sst_file_writer.Put("Key1", "Value1")); ASSERT_OK(sst_file_writer.Finish()); @@ -3514,19 +3547,26 @@ TEST_F(ExternalSSTFileWithTimestampTest, SanityCheck) { // overlapping key ranges. ASSERT_TRUE(IngestExternalUDTFile({file1, file2}).IsNotSupported()); - options.allow_ingest_behind = true; - DestroyAndReopen(options); - IngestExternalFileOptions opts; + for (bool cf_option : {false, true}) { + SCOPED_TRACE("cf_option = " + std::to_string(cf_option)); + if (cf_option) { + options.cf_allow_ingest_behind = true; + } else { + options.allow_ingest_behind = true; + } + DestroyAndReopen(options); + IngestExternalFileOptions opts; - // TODO(yuzhangyu): support ingestion behind for user-defined timestamps? - // Ingesting external files with user-defined timestamps requires searching - // through the whole lsm tree to make sure there is no key range overlap with - // the db. Ingestion behind currently is doing a simply placing it at the - // bottom level step without a search, so we don't allow it either. - opts.ingest_behind = true; - ASSERT_TRUE(db_->IngestExternalFile({file1}, opts).IsNotSupported()); + // TODO(yuzhangyu): support ingestion behind for user-defined timestamps? + // Ingesting external files with user-defined timestamps requires searching + // through the whole lsm tree to make sure there is no key range overlap + // with the db. Ingestion behind currently is doing a simply placing it at + // the bottom level step without a search, so we don't allow it either. + opts.ingest_behind = true; + ASSERT_TRUE(db_->IngestExternalFile({file1}, opts).IsNotSupported()); - DestroyAndRecreateExternalSSTFilesDir(); + DestroyAndRecreateExternalSSTFilesDir(); + } } TEST_F(ExternalSSTFileWithTimestampTest, UDTSettingsCompatibilityCheck) { @@ -3818,106 +3858,37 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) { ASSERT_OK(Put(1, Key(k), "cf1_" + Key(k))); } ASSERT_OK(Flush(/*cf=*/1)); - { - // Verify that largest key of the file has non-zero seqno. - std::vector> metadata; - dbfull()->TEST_GetFilesMetaData(handles_[1], &metadata, nullptr); - const FileMetaData& file = metadata[0][0]; - ValueType vtype; - SequenceNumber seq; - UnPackSequenceAndType(ExtractInternalKeyFooter(file.largest.Encode()), - &seq, &vtype); - ASSERT_GE(seq, 0); - } - std::vector live_meta; - db_->GetLiveFilesMetaData(&live_meta); - ASSERT_EQ(live_meta.size(), 1); - std::vector to_ingest_files; - to_ingest_files.emplace_back(live_meta[0].directory + "/" + - live_meta[0].relative_filename); - // Ingesting a file whose boundary key has non-zero seqno. - Status s = db_->IngestExternalFile(to_ingest_files, ingest_opts); - // This error msg is from checking seqno of boundary keys. - ASSERT_TRUE( - s.ToString().find("External file has non zero sequence number") != - std::string::npos); - ASSERT_NOK(s); - - { - // Only non-boundary key with non-zero seqno. - const Snapshot* snapshot = db_->GetSnapshot(); - ASSERT_OK(Put(1, Key(70), "cf1_" + Key(70))); - ASSERT_OK(Flush(1)); - CompactRangeOptions cro; - cro.bottommost_level_compaction = - BottommostLevelCompaction::kForceOptimized; - ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr)); - - // Verify that only the non-boundary key of the file has non-zero seqno. - std::vector> metadata; - // File may be at different level for different options. - dbfull()->TEST_GetFilesMetaData(handles_[1], &metadata, nullptr); - bool found_file = false; - for (const auto& level : metadata) { - if (level.empty()) { - continue; - } - ASSERT_FALSE(found_file); - found_file = true; - ASSERT_EQ(1, level.size()); - const FileMetaData& file = level[0]; - ValueType vtype; - SequenceNumber seq; - UnPackSequenceAndType(ExtractInternalKeyFooter(file.largest.Encode()), - &seq, &vtype); - ASSERT_EQ(seq, 0); - UnPackSequenceAndType(ExtractInternalKeyFooter(file.smallest.Encode()), - &seq, &vtype); - ASSERT_EQ(seq, 0); - ASSERT_GT(file.fd.largest_seqno, 0); - } - ASSERT_TRUE(found_file); - live_meta.clear(); - db_->GetLiveFilesMetaData(&live_meta); - ASSERT_EQ(live_meta.size(), 1); - to_ingest_files[0] = - live_meta[0].directory + "/" + live_meta[0].relative_filename; - s = db_->IngestExternalFile(to_ingest_files, ingest_opts); - ASSERT_NOK(s); - // This error msg is from checking largest seqno in table property. - ASSERT_TRUE(s.ToString().find("non zero largest sequence number") != - std::string::npos); - db_->ReleaseSnapshot(snapshot); - } + Status s; CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr)); - live_meta.clear(); + + std::vector live_meta; + std::vector to_ingest_files; db_->GetLiveFilesMetaData(&live_meta); ASSERT_EQ(live_meta.size(), 1); + ASSERT_EQ(live_meta[0].column_family_name, "toto"); ASSERT_EQ(0, live_meta[0].largest_seqno); - to_ingest_files[0] = - live_meta[0].directory + "/" + live_meta[0].relative_filename; + to_ingest_files.emplace_back(live_meta[0].directory + "/" + + live_meta[0].relative_filename); + // Ingesting a DB generated file with allow_db_generated_files = false ingest_opts.allow_db_generated_files = false; - // Ingesting a DB genrate file with allow_db_generated_files = false; s = db_->IngestExternalFile(to_ingest_files, ingest_opts); ASSERT_TRUE(s.ToString().find("External file version not found") != std::string::npos); ASSERT_NOK(s); const std::string err = - "An ingested file is assigned to a non-zero sequence number, which is " - "incompatible with ingestion option allow_db_generated_files"; + "An ingested file overlaps with existing data in the DB and has been " + "assigned a non-zero sequence number"; ingest_opts.allow_db_generated_files = true; s = db_->IngestExternalFile(to_ingest_files, ingest_opts); ASSERT_TRUE(s.ToString().find(err) != std::string::npos); ASSERT_NOK(s); - if (options.compaction_style != kCompactionStyleUniversal) { - // FIXME: after fixing ingestion with universal compaction, currently - // will always ingest into L0. + if (options.num_levels > 1) { ingest_opts.fail_if_not_bottommost_level = true; s = db_->IngestExternalFile(to_ingest_files, ingest_opts); ASSERT_NOK(s); @@ -4073,7 +4044,7 @@ TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) { std::string db2_path = test::PerThreadDBPath("DB2"); Options db2_options; db2_options.create_if_missing = true; - DB* db2 = nullptr; + std::unique_ptr db2; ASSERT_OK(DB::Open(db2_options, db2_path, &db2)); // Write some base data. expected_value.emplace_back(rnd.RandomString(100)); @@ -4102,10 +4073,10 @@ TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) { ASSERT_OK(db_->DropColumnFamily(temp_cfh)); ASSERT_OK(db_->DestroyColumnFamilyHandle(temp_cfh)); ASSERT_OK(db2->Close()); - delete db2; + db2.reset(); ASSERT_OK(DB::Open(db2_options, db2_path, &db2)); ASSERT_OK(db2->Close()); - delete db2; + db2.reset(); ASSERT_OK(DestroyDB(db2_path, db2_options)); } else { ASSERT_OK(db_->DropColumnFamily(temp_cfh)); @@ -4113,6 +4084,472 @@ TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) { } } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction)); } + +TEST_P(IngestDBGeneratedFileTest2, NonZeroSeqno) { + // Test ingestion of DB-generated SST files that contain non-zero sequence + // numbers. + IngestExternalFileOptions ingest_opts; + ingest_opts.allow_db_generated_files = true; + // This only works since we are ingesting without snapshot + // Failure case will be tested below. + ingest_opts.snapshot_consistency = std::get<0>(GetParam()); + ingest_opts.allow_global_seqno = std::get<1>(GetParam()); + ingest_opts.allow_blocking_flush = std::get<2>(GetParam()); + ingest_opts.fail_if_not_bottommost_level = std::get<3>(GetParam()); + ingest_opts.link_files = std::get<4>(GetParam()); + Random* rnd = Random::GetTLSInstance(); + rnd->Reset(std::random_device{}()); + std::ostringstream ingest_opts_trace; + ingest_opts_trace << "ingest_opts params: " << "snapshot_consistency=" + << ingest_opts.snapshot_consistency << ", " + << "allow_global_seqno=" << ingest_opts.allow_global_seqno + << ", " << "allow_blocking_flush=" + << ingest_opts.allow_blocking_flush << ", " + << "fail_if_not_bottommost_level=" + << ingest_opts.fail_if_not_bottommost_level << ", " + << "link_files=" << ingest_opts.link_files; + SCOPED_TRACE(ingest_opts_trace.str()); + + do { + SCOPED_TRACE("option_config_ = " + std::to_string(option_config_)); + + Options options = CurrentOptions(); + options.statistics = CreateDBStatistics(); + options.allow_concurrent_memtable_write = + false; // Required for VectorRepFactory + CreateAndReopenWithCF({"non_overlap", "overlap"}, options); + + ColumnFamilyHandle* non_overlap_cf = handles_[1]; + ColumnFamilyHandle* overlap_cf = handles_[2]; + + std::vector expected_values; + expected_values.resize(100); + WriteOptions wo; + // Setup target CF with non-overlapping base data Key1 and Key99 + // Will ingest keys [1, 98] below. + expected_values[0] = rnd->RandomString(100); + ASSERT_OK(db_->Put(wo, non_overlap_cf, Key(0), expected_values[0])); + ASSERT_OK(db_->Flush({}, non_overlap_cf)); + expected_values[99] = rnd->RandomString(100); + ASSERT_OK(db_->Put(wo, non_overlap_cf, Key(99), expected_values[99])); + + // Set up overlapping cf + ASSERT_OK(db_->Put(wo, overlap_cf, Key(50), rnd->RandomString(100))); + + // Create temp CF/DB + Options temp_cf_opts; + ColumnFamilyHandle* temp_cfh = nullptr; + std::unique_ptr temp_db_holder; + DB* from_db = nullptr; + std::string temp_db_name; + // Using a separate DB also validates that latest sequence number + // of target db is updated after ingestion (to the max sequence number + // in ingested files). + const bool use_temp_db = rnd->OneIn(2); + SCOPED_TRACE("use_temp_db: " + std::to_string(use_temp_db)); + + std::vector sst_file_paths; + // optional L5: files in key range [70, 98] + // L6: files in key range [1, 79] + temp_cf_opts.target_file_size_base = + 20 << 10; // Small files to create multiple SSTs + temp_cf_opts.num_levels = 7; + temp_cf_opts.disable_auto_compactions = true; // Manually set up LSM + temp_cf_opts.env = options.env; + + if (use_temp_db) { + temp_cf_opts.create_if_missing = true; + temp_db_name = dbname_ + "/temp_db_" + std::to_string(rnd->Next()); + ASSERT_OK(DB::Open(temp_cf_opts, temp_db_name, &temp_db_holder)); + from_db = temp_db_holder.get(); + temp_cfh = from_db->DefaultColumnFamily(); + } else { + from_db = db_.get(); + ASSERT_OK( + from_db->CreateColumnFamily(temp_cf_opts, "temp_cf", &temp_cfh)); + } + + // Use snapshot to ensure non-zero sequence numbers after compaction + const Snapshot* snapshot = from_db->GetSnapshot(); + + for (int k = 1; k < 99; ++k) { + expected_values[k] = rnd->RandomString(2000); + ASSERT_OK(from_db->Put(wo, temp_cfh, Key(k), expected_values[k])); + } + ASSERT_OK(from_db->Flush({}, temp_cfh)); + CompactRangeOptions cro; + cro.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(from_db->CompactRange(cro, temp_cfh, nullptr, nullptr)); + + ASSERT_GT(NumTableFilesAtLevel(6, temp_cfh, from_db), 1); + + const bool multi_level_ingestion = rnd->OneIn(2); + SCOPED_TRACE("Multi-level ingestion: " + + std::to_string(multi_level_ingestion)); + if (multi_level_ingestion) { + for (int k = 80; k < 99; ++k) { + expected_values[k] = rnd->RandomString(500); + ASSERT_OK(from_db->Put(wo, temp_cfh, Key(k), expected_values[k])); + } + ASSERT_OK(from_db->Flush({}, temp_cfh)); + + // Do some overwrites, and overlap with previous L0 to avoid trivial move + for (int k = 70; k < 82; ++k) { + expected_values[k] = rnd->RandomString(500); + ASSERT_OK(from_db->Put(wo, temp_cfh, Key(k), expected_values[k])); + } + ASSERT_OK(from_db->Flush({}, temp_cfh)); + + if (rnd->OneIn(2)) { + MoveFilesToLevel(5, temp_cfh, from_db); + ASSERT_GT(NumTableFilesAtLevel(5, temp_cfh, from_db), 0); + } + ASSERT_GT(NumTableFilesAtLevel(6, temp_cfh, from_db), 0); + } + SCOPED_TRACE("LSM of from_db " + FilesPerLevel(temp_cfh, from_db)); + + ColumnFamilyMetaData cf_meta; + from_db->GetColumnFamilyMetaData(temp_cfh, &cf_meta); + + // Iterate in reverse since IngestExternalFiles expect files to be ordered + // from old to new + for (auto level_meta = cf_meta.levels.rbegin(); + level_meta != cf_meta.levels.rend(); ++level_meta) { + // L0 files need to be added in reverse order. + for (auto file_meta = level_meta->files.rbegin(); + file_meta != level_meta->files.rend(); ++file_meta) { + // Validate that files contain non-zero sequence numbers + ASSERT_GT(file_meta->smallest_seqno, 0); + ASSERT_GE(file_meta->largest_seqno, file_meta->smallest_seqno); + sst_file_paths.emplace_back(file_meta->directory + "/" + + file_meta->relative_filename); + } + } + from_db->ReleaseSnapshot(snapshot); + + Status s; + // Perform ingestion and validate results + if (multi_level_ingestion && options.num_levels > 1) { + // fail_if_bottommost requres ingesting all files into the last level, + // so it fails if we are assiging files to multiple levels. + ingest_opts.fail_if_not_bottommost_level = true; + s = db_->IngestExternalFile(non_overlap_cf, sst_file_paths, ingest_opts); + ASSERT_NOK(s); + ASSERT_TRUE(s.ToString().find("Files cannot be ingested to Lmax") != + std::string::npos); + ingest_opts.fail_if_not_bottommost_level = false; + } + if (ingest_opts.snapshot_consistency) { + // snapshot_consisteny requires global sequence number assignment to + // ingested files if there is any live snapshot. + snapshot = db_->GetSnapshot(); + s = db_->IngestExternalFile(non_overlap_cf, sst_file_paths, ingest_opts); + ASSERT_NOK(s); + ASSERT_TRUE(s.ToString().find( + "An ingested file overlaps with existing data in the DB and has been " + "assigned a non-zero sequence number")); + db_->ReleaseSnapshot(snapshot); + } + + std::atomic file_scan_count{0}; + SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile:FileScan", + [&](void* /*arg*/) { file_scan_count++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK( + db_->IngestExternalFile(non_overlap_cf, sst_file_paths, ingest_opts)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + EXPECT_EQ(file_scan_count, 0); + + // Validate ingested data. + ReadOptions ro; + std::string val; + for (int k = 0; k < 100; ++k) { + s = db_->Get(ro, handles_[1], Key(k), &val); + ASSERT_OK(s) << "Should find ingested key " << Key(k); + ASSERT_EQ(val, expected_values[k]) << "key: " << Key(k); + } + + // Overlap with data in the CF + if (ingest_opts.allow_blocking_flush) { + s = db_->IngestExternalFile(overlap_cf, sst_file_paths, ingest_opts); + + ASSERT_NOK(s); + if (ingest_opts.fail_if_not_bottommost_level) { + ASSERT_TRUE(s.ToString().find("Files cannot be ingested to Lmax") != + std::string::npos) + << s.ToString(); + } else { + ASSERT_TRUE(s.ToString().find("An ingested file overlaps with existing " + "data in the DB and has been " + "assigned a non-zero sequence number") != + std::string::npos) + << s.ToString(); + } + } + + // Cleanup + // FIXME: Without this, the test triggers some data race between dropping + // CF and background compaction. + ASSERT_OK(db_->WaitForCompact({})); + if (use_temp_db) { + ASSERT_OK(from_db->Close()); + temp_db_holder.reset(); + ASSERT_OK(DestroyDB(temp_db_name, temp_cf_opts)); + } else { + ASSERT_OK(db_->DropColumnFamily(temp_cfh)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(temp_cfh)); + } + } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction)); +} + +std::string GenSecondaryKey(const std::string& pk, const std::string& val) { + return "index_" + val + "_" + pk; +}; + +TEST_P(IngestDBGeneratedFileTest2, ZeroAndNonZeroSeqno) { + // Test ingestion of SST files with zero and with non-zero sequence numbers. + // Generate data using a temp CF and a temp DB: + // 1. Temp CF with cf_allow_ingest_behind enabled to preserve non-zero seqno. + // 2. Temp DB with everything compacted to have zero seqno. + // Then ingest both types of files together into a target CF. + // This mimics a user case where temp DB contains data read from a + // snapshot while temp CF contains live writes after a snapshot is taken. + IngestExternalFileOptions ingest_opts; + ingest_opts.allow_db_generated_files = true; + ingest_opts.snapshot_consistency = std::get<0>(GetParam()); + ingest_opts.allow_global_seqno = std::get<1>(GetParam()); + ingest_opts.allow_blocking_flush = std::get<2>(GetParam()); + ingest_opts.fail_if_not_bottommost_level = std::get<3>(GetParam()); + ingest_opts.link_files = std::get<4>(GetParam()); + + Random* rnd = Random::GetTLSInstance(); + + do { + SCOPED_TRACE("option_config_ = " + std::to_string(option_config_)); + Options options = CurrentOptions(); + options.allow_concurrent_memtable_write = false; + // Force more flushes/compactions and more files to be generated + options.target_file_size_base = 1 << 10; // 1KB + options.max_bytes_for_level_base = 2 << 10; // 2KB + options.max_bytes_for_level_multiplier = 2; + options.level0_file_num_compaction_trigger = 2; + options.level_compaction_dynamic_level_bytes = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"target_cf"}, options); + auto* target_cfh = handles_[1]; + + Options live_write_cf_opts = options; + live_write_cf_opts.memtable_factory.reset(new VectorRepFactory()); + live_write_cf_opts.compaction_style = kCompactionStyleUniversal; + live_write_cf_opts.cf_allow_ingest_behind = true; + live_write_cf_opts.num_levels = 50; + ColumnFamilyHandle* live_write_cfh; + ASSERT_OK(db_->CreateColumnFamily(live_write_cf_opts, "live_write_cf", + &live_write_cfh)); + + // Expected value and key + std::map expected; + std::unordered_set deleted; + std::stringstream debug_info; + + // Setup base data in target CF, will ingest keys with different prefixes + // so they don't overlap with the base data. + WriteOptions wo; + for (int k = 0; k < 100; ++k) { + int random_val = rnd->Uniform(20); + expected[Key(k)] = std::to_string(random_val); + ASSERT_OK(db_->Put(wo, target_cfh, Key(k), expected[Key(k)])); + + // Force flush every 20 keys to create multiple SST files + if (rnd->OneIn(20)) { + ASSERT_OK(db_->Flush({}, target_cfh)); + debug_info << "Flush after " << k + << ", LSM state: " << FilesPerLevel(target_cfh) << "\n"; + } + } + + // Temp DB for snapshot data + Options temp_db_opts; + temp_db_opts.create_if_missing = true; + temp_db_opts.target_file_size_base = 1 << 10; + temp_db_opts.write_buffer_size = 1 << 10; + temp_db_opts.memtable_factory.reset(new VectorRepFactory()); + temp_db_opts.allow_concurrent_memtable_write = false; + temp_db_opts.compaction_style = kCompactionStyleUniversal; + temp_db_opts.env = env_; + temp_db_opts.num_levels = 7; + + std::string temp_db_name = + dbname_ + "/temp_db_" + std::to_string(rnd->Next()); + std::unique_ptr temp_db; + ASSERT_OK(DB::Open(temp_db_opts, temp_db_name, &temp_db)); + + const Snapshot* snapshot = db_->GetSnapshot(); + ReadOptions ro; + ro.snapshot = snapshot; + ro.total_order_seek = true; + std::unique_ptr iter{db_->NewIterator(ro, target_cfh)}; + // transform data read from snapshot and write to temp DB + // Varying the number of files in temp DB. + const int kValSize = rnd->Uniform(200); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + std::string key = iter->key().ToString(); + std::string value = iter->value().ToString(); + std::string sk = GenSecondaryKey(key, value); + // Usually value is empty, here we use a larger value to generate + // multiple SST files in temp_db. + std::string sk_val = rnd->RandomString(kValSize); + ASSERT_OK(temp_db->Put(wo, sk, sk_val)); + expected[sk] = sk_val; + debug_info << "Snapshot data: " << sk << " -> \n"; + } + ASSERT_OK(iter->status()); + + // Do some live writes into target CF and live write CF. + for (int i = 0; i < 10; ++i) { + WriteBatch wb; + for (int j = 0; j < 5; ++j) { + std::string key = Key(rnd->Uniform(100)); + std::string old_val = expected[key]; + // Value range is 0-19, allow some PK to have the same value. + int random_val = rnd->Uniform(20); + std::string new_val = std::to_string(random_val); + std::string old_index_key = GenSecondaryKey(key, old_val); + std::string new_index_key = GenSecondaryKey(key, new_val); + ASSERT_OK(wb.SingleDelete(live_write_cfh, old_index_key)); + std::string sk_val = rnd->RandomString(kValSize); + ASSERT_OK(wb.Put(live_write_cfh, new_index_key, sk_val)); + ASSERT_OK(wb.Put(target_cfh, key, new_val)); + expected[key] = new_val; + expected.erase(old_index_key); + expected[new_index_key] = sk_val; + deleted.insert(old_index_key); + deleted.erase(new_index_key); + + debug_info << "Live write: SD " << old_index_key << "\n"; + debug_info << "Live write: " << key << " -> " << new_val << "\n"; + debug_info << "Live write: " << new_index_key << " -> \n"; + } + ASSERT_OK(db_->Write(wo, &wb)); + if (rnd->OneIn(3)) { + debug_info << "Flush after " << i << " live writes\n"; + ASSERT_OK(db_->Flush({}, live_write_cfh)); + } + } + iter.reset(); + db_->ReleaseSnapshot(snapshot); + + // Compact temp_db to ensure zero sequence numbers + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(temp_db->CompactRange(cro, nullptr, nullptr)); + SCOPED_TRACE("Temp DB LSM: " + + FilesPerLevel(temp_db->DefaultColumnFamily(), temp_db.get())); + + // Base data from snapshot + std::vector sst_file_paths_zero_seqno; + + // Collect SST file paths with zero sequence numbers + ASSERT_OK(temp_db->DisableFileDeletions()); + ColumnFamilyMetaData cf_meta_temp_db; + temp_db->GetColumnFamilyMetaData(&cf_meta_temp_db); + for (const auto& level_meta : cf_meta_temp_db.levels) { + if (level_meta.level == 6) { + for (const auto& file_meta : level_meta.files) { + // Verify files have zero sequence numbers + ASSERT_EQ(0, file_meta.largest_seqno) + << "File " << file_meta.relative_filename + << " should have zero sequence number\n" + << debug_info.str(); + sst_file_paths_zero_seqno.emplace_back(file_meta.directory + "/" + + file_meta.relative_filename); + } + } else { + // All files should be in L6 + ASSERT_EQ(0, level_meta.files.size()) << debug_info.str(); + } + } + + // Flush remaining catch up writes in memtable + ASSERT_OK(db_->Flush({}, live_write_cfh)); + SCOPED_TRACE("LSM of live write cfh " + FilesPerLevel(live_write_cfh)); + // Collect SST file paths with non-zero sequence numbers + ColumnFamilyMetaData live_write_cf_meta; + ASSERT_OK(db_->DisableFileDeletions()); + db_->GetColumnFamilyMetaData(live_write_cfh, &live_write_cf_meta); + + // Live writes after snapshot + std::vector sst_file_paths_nonzero_seqno; + for (auto level_meta = live_write_cf_meta.levels.rbegin(); + level_meta != live_write_cf_meta.levels.rend(); ++level_meta) { + // Reverse order is important for L0, where recent updates are ordered + // first + for (auto file_meta = level_meta->files.rbegin(); + file_meta != level_meta->files.rend(); ++file_meta) { + sst_file_paths_nonzero_seqno.emplace_back(file_meta->directory + "/" + + file_meta->relative_filename); + ASSERT_GT(file_meta->smallest_seqno, 0) << debug_info.str(); + } + if (level_meta->level == 49) { + // Ingest behind does not compact to the last level + ASSERT_EQ(level_meta->files.size(), 0) << debug_info.str(); + } + } + + ASSERT_GT(sst_file_paths_zero_seqno.size(), 0) << debug_info.str(); + ASSERT_GT(sst_file_paths_nonzero_seqno.size(), 0) << debug_info.str(); + + // Combine all SST file paths. + // File ingestion takes files from old to new. + std::vector all_sst_files; + all_sst_files.insert(all_sst_files.end(), sst_file_paths_zero_seqno.begin(), + sst_file_paths_zero_seqno.end()); + all_sst_files.insert(all_sst_files.end(), + sst_file_paths_nonzero_seqno.begin(), + sst_file_paths_nonzero_seqno.end()); + if (ingest_opts.fail_if_not_bottommost_level && options.num_levels > 1) { + // overlapping files will be ingested into different levels, including non + // Lmax + Status s = + db_->IngestExternalFile(target_cfh, all_sst_files, ingest_opts); + ASSERT_NOK(s); + ASSERT_TRUE(s.ToString().find("Files cannot be ingested to Lmax") != + std::string::npos); + } else { + ASSERT_OK( + db_->IngestExternalFile(target_cfh, all_sst_files, ingest_opts)); + + debug_info << "Zero seqno files: " << sst_file_paths_zero_seqno.size() + << "\nNon-zero seqno files: " + << sst_file_paths_nonzero_seqno.size() << "\n"; + + SCOPED_TRACE("Debug info:\n" + debug_info.str()); + VerifyDBFromMap(expected, nullptr, false, nullptr, target_cfh, &deleted); + } + + // clean up + ASSERT_OK(db_->EnableFileDeletions()); + ASSERT_OK(temp_db->EnableFileDeletions()); + + // FIXME: Without this, the test triggers some data race between dropping + // CF and background compaction. + ASSERT_OK(db_->WaitForCompact({})); + + ASSERT_OK(db_->DropColumnFamily(live_write_cfh)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(live_write_cfh)); + + ASSERT_OK(temp_db->Close()); + temp_db.reset(); + ASSERT_OK(DestroyDB(temp_db_name, temp_db_opts)); + } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction)); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 3152c7635bea..9e7ec6ddd2ed 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -76,7 +76,7 @@ class FaultInjectionTest std::string dbname_; std::shared_ptr tiny_cache_; Options options_; - DB* db_; + std::unique_ptr db_; FaultInjectionTest() : option_config_(std::get<1>(GetParam())), @@ -260,10 +260,7 @@ class FaultInjectionTest return Slice(*storage); } - void CloseDB() { - delete db_; - db_ = nullptr; - } + void CloseDB() { db_.reset(); } Status OpenDB() { CloseDB(); @@ -348,7 +345,8 @@ class FaultInjectionTest } void WaitCompactionFinish() { - ASSERT_OK(static_cast(db_->GetRootDB())->TEST_WaitForCompact()); + ASSERT_OK(static_cast_with_check(db_->GetRootDB()) + ->TEST_WaitForCompact()); ASSERT_OK(db_->Put(WriteOptions(), "", "")); } diff --git a/db/flush_job.cc b/db/flush_job.cc index ac2eaeb6c55c..e5221afca878 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -92,12 +92,10 @@ FlushJob::FlushJob( const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id, const FileOptions& file_options, VersionSet* versions, InstrumentedMutex* db_mutex, std::atomic* shutting_down, - std::vector existing_snapshots, - SequenceNumber earliest_write_conflict_snapshot, - SnapshotChecker* snapshot_checker, JobContext* job_context, - FlushReason flush_reason, LogBuffer* log_buffer, FSDirectory* db_directory, - FSDirectory* output_file_directory, CompressionType output_compression, - Statistics* stats, EventLogger* event_logger, bool measure_io_stats, + JobContext* job_context, FlushReason flush_reason, LogBuffer* log_buffer, + FSDirectory* db_directory, FSDirectory* output_file_directory, + CompressionType output_compression, Statistics* stats, + EventLogger* event_logger, bool measure_io_stats, const bool sync_output_directory, const bool write_manifest, Env::Priority thread_pri, const std::shared_ptr& io_tracer, std::shared_ptr seqno_to_time_mapping, @@ -114,12 +112,7 @@ FlushJob::FlushJob( versions_(versions), db_mutex_(db_mutex), shutting_down_(shutting_down), - existing_snapshots_(std::move(existing_snapshots)), - earliest_snapshot_(existing_snapshots_.empty() - ? kMaxSequenceNumber - : existing_snapshots_.at(0)), - earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), - snapshot_checker_(snapshot_checker), + earliest_snapshot_(job_context->GetEarliestSnapshotSequence()), job_context_(job_context), flush_reason_(flush_reason), log_buffer_(log_buffer), @@ -140,6 +133,7 @@ FlushJob::FlushJob( full_history_ts_low_(std::move(full_history_ts_low)), blob_callback_(blob_callback), seqno_to_time_mapping_(std::move(seqno_to_time_mapping)) { + assert(job_context->snapshot_context_initialized); // Update the thread status to indicate flush. ReportStartedFlush(); TEST_SYNC_POINT("FlushJob::FlushJob()"); @@ -456,7 +450,7 @@ Status FlushJob::MemPurge() { const std::string* const full_history_ts_low = &(cfd_->GetFullHistoryTsLow()); std::unique_ptr range_del_agg( new CompactionRangeDelAggregator(&(cfd_->internal_comparator()), - existing_snapshots_, + job_context_->snapshot_seqs, full_history_ts_low)); for (auto& rd_iter : range_del_iters) { range_del_agg->AddTombstones(std::move(rd_iter)); @@ -495,21 +489,20 @@ Status FlushJob::MemPurge() { Env* env = db_options_.env; assert(env); - MergeHelper merge( - env, (cfd_->internal_comparator()).user_comparator(), - (ioptions.merge_operator).get(), compaction_filter.get(), - ioptions.logger, true /* internal key corruption is not ok */, - existing_snapshots_.empty() ? 0 : existing_snapshots_.back(), - snapshot_checker_); + MergeHelper merge(env, (cfd_->internal_comparator()).user_comparator(), + (ioptions.merge_operator).get(), compaction_filter.get(), + ioptions.logger, + true /* internal key corruption is not ok */, + job_context_->GetLatestSnapshotSequence(), + job_context_->snapshot_checker); assert(job_context_); - SequenceNumber job_snapshot_seq = job_context_->GetJobSnapshotSequence(); const std::atomic kManualCompactionCanceledFalse{false}; CompactionIterator c_iter( iter.get(), (cfd_->internal_comparator()).user_comparator(), &merge, - kMaxSequenceNumber, &existing_snapshots_, earliest_snapshot_, - earliest_write_conflict_snapshot_, job_snapshot_seq, snapshot_checker_, - env, ShouldReportDetailedTime(env, ioptions.stats), - true /* internal key corruption is not ok */, range_del_agg.get(), + kMaxSequenceNumber, &job_context_->snapshot_seqs, earliest_snapshot_, + job_context_->earliest_write_conflict_snapshot, + job_context_->GetJobSnapshotSequence(), job_context_->snapshot_checker, + env, ShouldReportDetailedTime(env, ioptions.stats), range_del_agg.get(), nullptr, ioptions.allow_data_in_errors, ioptions.enforce_single_del_contracts, /*manual_compaction_canceled=*/kManualCompactionCanceledFalse, @@ -761,7 +754,7 @@ bool FlushJob::MemPurgeDecider(double threshold) { // Pick the oldest existing snapshot that is more recent // than the sequence number of the sampled entry. min_seqno_snapshot = kMaxSequenceNumber; - for (SequenceNumber seq_num : existing_snapshots_) { + for (SequenceNumber seq_num : job_context_->snapshot_seqs) { if (seq_num > res.sequence && seq_num < min_seqno_snapshot) { min_seqno_snapshot = seq_num; } @@ -868,9 +861,12 @@ Status FlushJob::WriteLevel0Table() { ts_sz > 0 && !cfd_->ioptions().persist_user_defined_timestamps; std::vector blob_file_additions; - + // Note that here we treat flush as level 0 compaction in internal stats + InternalStats::CompactionStats flush_stats(CompactionReason::kFlush, + 1 /* count**/); { - auto write_hint = base_->storage_info()->CalculateSSTWriteHint(/*level=*/0); + auto write_hint = base_->storage_info()->CalculateSSTWriteHint( + /*level=*/0, db_options_.calculate_sst_write_lifetime_hint_set); Env::IOPriority io_priority = GetRateLimiterPriority(); db_mutex_->Unlock(); if (log_buffer_) { @@ -886,7 +882,7 @@ Status FlushJob::WriteLevel0Table() { ro.total_order_seek = true; ro.io_activity = Env::IOActivity::kFlush; Arena arena; - uint64_t total_num_entries = 0, total_num_deletes = 0; + uint64_t total_num_input_entries = 0, total_num_deletes = 0; uint64_t total_data_size = 0; size_t total_memory_usage = 0; uint64_t total_num_range_deletes = 0; @@ -900,9 +896,9 @@ Status FlushJob::WriteLevel0Table() { for (ReadOnlyMemTable* m : mems_) { ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Flushing memtable id %" PRIu64 - " with next log file: %" PRIu64 "\n", + " with next log file: %" PRIu64 ", marked_for_flush: %d\n", cfd_->GetName().c_str(), job_context_->job_id, m->GetID(), - m->GetNextLogNumber()); + m->GetNextLogNumber(), m->IsMarkedForFlush()); if (logical_strip_timestamp) { memtables.push_back(m->NewTimestampStrippingIterator( ro, /*seqno_to_time_mapping=*/nullptr, &arena, @@ -921,7 +917,7 @@ Status FlushJob::WriteLevel0Table() { if (range_del_iter != nullptr) { range_del_iters.emplace_back(range_del_iter); } - total_num_entries += m->NumEntries(); + total_num_input_entries += m->NumEntries(); total_num_deletes += m->NumDeletion(); total_data_size += m->GetDataSize(); total_memory_usage += m->ApproximateMemoryUsage(); @@ -933,11 +929,12 @@ Status FlushJob::WriteLevel0Table() { // "Write Buffer Full", should make update flush_reason_ accordingly. event_logger_->Log() << "job" << job_context_->job_id << "event" << "flush_started" << "num_memtables" << mems_.size() - << "num_entries" << total_num_entries << "num_deletes" - << total_num_deletes << "total_data_size" - << total_data_size << "memory_usage" - << total_memory_usage << "num_range_deletes" - << total_num_range_deletes << "flush_reason" + << "total_num_input_entries" << total_num_input_entries + << "num_deletes" << total_num_deletes + << "total_data_size" << total_data_size + << "memory_usage" << total_memory_usage + << "num_range_deletes" << total_num_range_deletes + << "flush_reason" << GetFlushReasonString(flush_reason_); { @@ -975,7 +972,6 @@ Status FlushJob::WriteLevel0Table() { meta_.oldest_ancester_time = oldest_ancester_time; meta_.file_creation_time = current_time; - uint64_t num_input_entries = 0; uint64_t memtable_payload_bytes = 0; uint64_t memtable_garbage_bytes = 0; IOStatus io_s; @@ -997,28 +993,49 @@ Status FlushJob::WriteLevel0Table() { preclude_last_level_min_seqno_ == kMaxSequenceNumber ? preclude_last_level_min_seqno_ : std::min(earliest_snapshot_, preclude_last_level_min_seqno_)); - const SequenceNumber job_snapshot_seq = - job_context_->GetJobSnapshotSequence(); - s = BuildTable( dbname_, versions_, db_options_, tboptions, file_options_, cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_, - &blob_file_additions, existing_snapshots_, earliest_snapshot_, - earliest_write_conflict_snapshot_, job_snapshot_seq, - snapshot_checker_, mutable_cf_options_.paranoid_file_checks, - cfd_->internal_stats(), &io_s, io_tracer_, - BlobFileCreationReason::kFlush, seqno_to_time_mapping_.get(), - event_logger_, job_context_->job_id, &table_properties_, write_hint, - full_history_ts_low, blob_callback_, base_, &num_input_entries, - &memtable_payload_bytes, &memtable_garbage_bytes); + &blob_file_additions, job_context_->snapshot_seqs, earliest_snapshot_, + job_context_->earliest_write_conflict_snapshot, + job_context_->GetJobSnapshotSequence(), + job_context_->snapshot_checker, + mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(), + &io_s, io_tracer_, BlobFileCreationReason::kFlush, + seqno_to_time_mapping_.get(), event_logger_, job_context_->job_id, + &table_properties_, write_hint, full_history_ts_low, blob_callback_, + base_, &memtable_payload_bytes, &memtable_garbage_bytes, + &flush_stats); TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:s", &s); // TODO: Cleanup io_status in BuildTable and table builders assert(!s.ok() || io_s.ok()); io_s.PermitUncheckedError(); - if (num_input_entries != total_num_entries && s.ok()) { - std::string msg = "Expected " + std::to_string(total_num_entries) + + if (s.ok() && total_num_input_entries != flush_stats.num_input_records) { + std::string msg = "Expected " + + std::to_string(total_num_input_entries) + " entries in memtables, but read " + - std::to_string(num_input_entries); + std::to_string(flush_stats.num_input_records); + ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s", + cfd_->GetName().c_str(), job_context_->job_id, + msg.c_str()); + if (db_options_.flush_verify_memtable_count) { + s = Status::Corruption(msg); + } + } + + // Only verify on table with format collects table properties + if (s.ok() && + (mutable_cf_options_.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName()) || + mutable_cf_options_.table_factory->IsInstanceOf( + TableFactory::kPlainTableName())) && + flush_stats.num_output_records != table_properties_.num_entries) { + std::string msg = + "Number of keys in flush output SST files does not match " + "number of keys added to the table. Expected " + + std::to_string(flush_stats.num_output_records) + " but there are " + + std::to_string(table_properties_.num_entries) + + " in output SST files"; ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s", cfd_->GetName().c_str(), job_context_->job_id, msg.c_str()); @@ -1078,42 +1095,42 @@ Status FlushJob::WriteLevel0Table() { meta_.file_creation_time, meta_.epoch_number, meta_.file_checksum, meta_.file_checksum_func_name, meta_.unique_id, meta_.compensated_range_deletion_size, - meta_.tail_size, meta_.user_defined_timestamps_persisted); + meta_.tail_size, meta_.user_defined_timestamps_persisted, + meta_.min_timestamp, meta_.max_timestamp); edit_->SetBlobFileAdditions(std::move(blob_file_additions)); } // Piggyback FlushJobInfo on the first first flushed memtable. mems_[0]->SetFlushJobInfo(GetFlushJobInfo()); - // Note that here we treat flush as level 0 compaction in internal stats - InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); const uint64_t micros = clock_->NowMicros() - start_micros; const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros; - stats.micros = micros; - stats.cpu_micros = cpu_micros; + flush_stats.micros = micros; + flush_stats.cpu_micros += cpu_micros; ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Flush lasted %" PRIu64 " microseconds, and %" PRIu64 " cpu microseconds.\n", cfd_->GetName().c_str(), job_context_->job_id, micros, - cpu_micros); + flush_stats.cpu_micros); if (has_output) { - stats.bytes_written = meta_.fd.GetFileSize(); - stats.num_output_files = 1; + flush_stats.bytes_written = meta_.fd.GetFileSize(); + flush_stats.num_output_files = 1; } const auto& blobs = edit_->GetBlobFileAdditions(); for (const auto& blob : blobs) { - stats.bytes_written_blob += blob.GetTotalBlobBytes(); + flush_stats.bytes_written_blob += blob.GetTotalBlobBytes(); } - stats.num_output_files_blob = static_cast(blobs.size()); + flush_stats.num_output_files_blob = static_cast(blobs.size()); - RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros); - cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats); + RecordTimeToHistogram(stats_, FLUSH_TIME, flush_stats.micros); + cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, + flush_stats); cfd_->internal_stats()->AddCFStats( InternalStats::BYTES_FLUSHED, - stats.bytes_written + stats.bytes_written_blob); + flush_stats.bytes_written + flush_stats.bytes_written_blob); RecordFlushIOStats(); return s; @@ -1193,13 +1210,12 @@ void FlushJob::GetEffectiveCutoffUDTForPickedMemTables() { } void FlushJob::GetPrecludeLastLevelMinSeqno() { - if (mutable_cf_options_.preclude_last_level_data_seconds == 0 || - // FIXME: create FlushJob and build SuperVersions such that - // preclude_last_level_data_seconds > 0 implies - // seqno_to_time_mapping_ != nullptr - seqno_to_time_mapping_ == nullptr) { + if (mutable_cf_options_.preclude_last_level_data_seconds == 0) { return; } + // SuperVersion should guarantee this + assert(seqno_to_time_mapping_); + assert(!seqno_to_time_mapping_->Empty()); int64_t current_time = 0; Status s = db_options_.clock->GetCurrentTime(¤t_time); if (!s.ok()) { diff --git a/db/flush_job.h b/db/flush_job.h index 1c1f15d1b1dc..aa95c7b41aef 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -63,11 +63,9 @@ class FlushJob { const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id, const FileOptions& file_options, VersionSet* versions, InstrumentedMutex* db_mutex, std::atomic* shutting_down, - std::vector existing_snapshots, - SequenceNumber earliest_write_conflict_snapshot, - SnapshotChecker* snapshot_checker, JobContext* job_context, - FlushReason flush_reason, LogBuffer* log_buffer, - FSDirectory* db_directory, FSDirectory* output_file_directory, + JobContext* job_context, FlushReason flush_reason, + LogBuffer* log_buffer, FSDirectory* db_directory, + FSDirectory* output_file_directory, CompressionType output_compression, Statistics* stats, EventLogger* event_logger, bool measure_io_stats, const bool sync_output_directory, const bool write_manifest, @@ -167,10 +165,7 @@ class FlushJob { VersionSet* versions_; InstrumentedMutex* db_mutex_; std::atomic* shutting_down_; - std::vector existing_snapshots_; SequenceNumber earliest_snapshot_; - SequenceNumber earliest_write_conflict_snapshot_; - SnapshotChecker* snapshot_checker_; JobContext* job_context_; FlushReason flush_reason_; LogBuffer* log_buffer_; @@ -234,7 +229,7 @@ class FlushJob { // The current minimum seqno that compaction jobs will preclude the data from // the last level. Data with seqnos larger than this or larger than - // `earliest_snapshot_` will be output to the penultimate level had it gone + // `earliest_snapshot_` will be output to the proximal level had it gone // through a compaction to the last level. SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber; }; diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index f37eaf829be5..3d4cf1d8debd 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -142,13 +142,13 @@ class FlushJobTestBase : public testing::Test { column_families.emplace_back(cf_name, cf_options_); } - versions_.reset( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - test::kUnitTestDbId, /*db_session_id=*/"", - /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr, /*read_only=*/false)); + versions_.reset(new VersionSet( + dbname_, &db_options_, MutableDBOptions{options_}, env_options_, + table_cache_.get(), &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + test::kUnitTestDbId, /*db_session_id=*/"", + /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr, /*read_only=*/false)); EXPECT_OK(versions_->Recover(column_families, false)); } @@ -186,16 +186,16 @@ TEST_F(FlushJobTest, Empty) { JobContext job_context(0); auto cfd = versions_->GetColumnFamilySet()->GetDefault(); EventLogger event_logger(db_options_.info_log.get()); - SnapshotChecker* snapshot_checker = nullptr; // not relavant - FlushJob flush_job( - dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, - cfd->GetLatestMutableCFOptions(), - std::numeric_limits::max() /* memtable_id */, env_options_, - versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, - snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr, - nullptr, kNoCompression, nullptr, &event_logger, false, - true /* sync_output_directory */, true /* write_manifest */, - Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); + job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {}); + FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), + db_options_, cfd->GetLatestMutableCFOptions(), + std::numeric_limits::max() /* memtable_id */, + env_options_, versions_.get(), &mutex_, &shutting_down_, + &job_context, FlushReason::kTest, nullptr, nullptr, + nullptr, kNoCompression, nullptr, &event_logger, false, + true /* sync_output_directory */, + true /* write_manifest */, Env::Priority::USER, + nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); { InstrumentedMutexLock l(&mutex_); flush_job.PickMemTable(); @@ -272,16 +272,16 @@ TEST_F(FlushJobTest, NonEmpty) { } EventLogger event_logger(db_options_.info_log.get()); - SnapshotChecker* snapshot_checker = nullptr; // not relavant - FlushJob flush_job( - dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, - cfd->GetLatestMutableCFOptions(), - std::numeric_limits::max() /* memtable_id */, env_options_, - versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, - snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr, - nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, - true, true /* sync_output_directory */, true /* write_manifest */, - Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); + job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {}); + FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), + db_options_, cfd->GetLatestMutableCFOptions(), + std::numeric_limits::max() /* memtable_id */, + env_options_, versions_.get(), &mutex_, &shutting_down_, + &job_context, FlushReason::kTest, nullptr, nullptr, + nullptr, kNoCompression, db_options_.statistics.get(), + &event_logger, true, true /* sync_output_directory */, + true /* write_manifest */, Env::Priority::USER, + nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); HistogramData hist; FileMetaData file_meta; @@ -332,18 +332,18 @@ TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) { } EventLogger event_logger(db_options_.info_log.get()); - SnapshotChecker* snapshot_checker = nullptr; // not relavant assert(memtable_ids.size() == num_mems); uint64_t smallest_memtable_id = memtable_ids.front(); uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1; + job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {}); FlushJob flush_job( dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_, - versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, - snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr, - nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, - true, true /* sync_output_directory */, true /* write_manifest */, + versions_.get(), &mutex_, &shutting_down_, &job_context, + FlushReason::kTest, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); HistogramData hist; FileMetaData file_meta; @@ -405,18 +405,17 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { } EventLogger event_logger(db_options_.info_log.get()); - SnapshotChecker* snapshot_checker = nullptr; // not relevant std::vector> flush_jobs; k = 0; + job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {}); for (auto cfd : all_cfds) { std::vector snapshot_seqs; flush_jobs.emplace_back(new FlushJob( dbname_, cfd, db_options_, cfd->GetLatestMutableCFOptions(), memtable_ids[k], env_options_, versions_.get(), &mutex_, - &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker, - &job_context, FlushReason::kTest, nullptr, nullptr, nullptr, - kNoCompression, db_options_.statistics.get(), &event_logger, true, - false /* sync_output_directory */, false /* write_manifest */, + &shutting_down_, &job_context, FlushReason::kTest, nullptr, nullptr, + nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, + true, false /* sync_output_directory */, false /* write_manifest */, Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_)); k++; @@ -532,16 +531,17 @@ TEST_F(FlushJobTest, Snapshots) { } EventLogger event_logger(db_options_.info_log.get()); - SnapshotChecker* snapshot_checker = nullptr; // not relavant - FlushJob flush_job( - dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, - cfd->GetLatestMutableCFOptions(), - std::numeric_limits::max() /* memtable_id */, env_options_, - versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, - snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr, - nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, - true, true /* sync_output_directory */, true /* write_manifest */, - Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); + job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, + std::move(snapshots)); + FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), + db_options_, cfd->GetLatestMutableCFOptions(), + std::numeric_limits::max() /* memtable_id */, + env_options_, versions_.get(), &mutex_, &shutting_down_, + &job_context, FlushReason::kTest, nullptr, nullptr, + nullptr, kNoCompression, db_options_.statistics.get(), + &event_logger, true, true /* sync_output_directory */, + true /* write_manifest */, Env::Priority::USER, + nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); mutex_.Lock(); flush_job.PickMemTable(); ASSERT_OK(flush_job.Run()); @@ -585,18 +585,18 @@ TEST_F(FlushJobTest, GetRateLimiterPriorityForWrite) { } EventLogger event_logger(db_options_.info_log.get()); - SnapshotChecker* snapshot_checker = nullptr; // not relavant assert(memtable_ids.size() == num_mems); uint64_t smallest_memtable_id = memtable_ids.front(); uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1; + job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {}); FlushJob flush_job( dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_, - versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, - snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr, - nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, - true, true /* sync_output_directory */, true /* write_manifest */, + versions_.get(), &mutex_, &shutting_down_, &job_context, + FlushReason::kTest, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); // When the state from WriteController is normal. @@ -658,16 +658,16 @@ TEST_F(FlushJobTest, ReplaceTimedPutWriteTimeWithPreferredSeqno) { } EventLogger event_logger(db_options_.info_log.get()); - SnapshotChecker* snapshot_checker = nullptr; // not relevant - FlushJob flush_job( - dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, - cfd->GetLatestMutableCFOptions(), - std::numeric_limits::max() /* memtable_id */, env_options_, - versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, - snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr, - nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, - true, true /* sync_output_directory */, true /* write_manifest */, - Env::Priority::USER, nullptr /*IOTracer*/, seqno_to_time_mapping); + job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {}); + FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), + db_options_, cfd->GetLatestMutableCFOptions(), + std::numeric_limits::max() /* memtable_id */, + env_options_, versions_.get(), &mutex_, &shutting_down_, + &job_context, FlushReason::kTest, nullptr, nullptr, + nullptr, kNoCompression, db_options_.statistics.get(), + &event_logger, true, true /* sync_output_directory */, + true /* write_manifest */, Env::Priority::USER, + nullptr /*IOTracer*/, seqno_to_time_mapping); FileMetaData file_meta; mutex_.Lock(); @@ -761,19 +761,19 @@ TEST_P(FlushJobTimestampTest, AllKeysExpired) { } std::vector snapshots; - constexpr SnapshotChecker* const snapshot_checker = nullptr; JobContext job_context(0); EventLogger event_logger(db_options_.info_log.get()); std::string full_history_ts_low; PutFixed64(&full_history_ts_low, std::numeric_limits::max()); cfd->SetFullHistoryTsLow(full_history_ts_low); + job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {}); FlushJob flush_job( dbname_, cfd, db_options_, cfd->GetLatestMutableCFOptions(), std::numeric_limits::max() /* memtable_id */, env_options_, - versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, - snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr, - nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, - true, true /* sync_output_directory */, true /* write_manifest */, + versions_.get(), &mutex_, &shutting_down_, &job_context, + FlushReason::kTest, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_, /*db_id=*/"", /*db_session_id=*/"", full_history_ts_low); @@ -823,8 +823,8 @@ TEST_P(FlushJobTimestampTest, NoKeyExpired) { } std::vector snapshots; - SnapshotChecker* const snapshot_checker = nullptr; JobContext job_context(0); + job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {}); EventLogger event_logger(db_options_.info_log.get()); std::string full_history_ts_low; PutFixed64(&full_history_ts_low, 0); @@ -832,10 +832,10 @@ TEST_P(FlushJobTimestampTest, NoKeyExpired) { FlushJob flush_job( dbname_, cfd, db_options_, cfd->GetLatestMutableCFOptions(), std::numeric_limits::max() /* memtable_id */, env_options_, - versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, - snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr, - nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, - true, true /* sync_output_directory */, true /* write_manifest */, + versions_.get(), &mutex_, &shutting_down_, &job_context, + FlushReason::kTest, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_, /*db_id=*/"", /*db_session_id=*/"", full_history_ts_low); diff --git a/db/forward_iterator.h b/db/forward_iterator.h index 11dde54777e7..81a7f3132980 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -42,6 +42,7 @@ using MinIterHeap = std::priority_queue, MinIterComparator>; +// TODO: name to TailingIterator /** * ForwardIterator is a special type of iterator that only supports Seek() * and Next(). It is expected to perform better than TailingIterator by diff --git a/db/forward_iterator_bench.cc b/db/forward_iterator_bench.cc index b57b119e484a..ecab01168474 100644 --- a/db/forward_iterator_bench.cc +++ b/db/forward_iterator_bench.cc @@ -344,19 +344,18 @@ int main(int argc, char** argv) { status = ROCKSDB_NAMESPACE::DestroyDB(path, options); assert(status.ok()); - ROCKSDB_NAMESPACE::DB* db_raw; - status = ROCKSDB_NAMESPACE::DB::Open(options, path, &db_raw); + std::unique_ptr db; + status = ROCKSDB_NAMESPACE::DB::Open(options, path, &db); assert(status.ok()); - std::unique_ptr db(db_raw); std::vector shard_states(FLAGS_shards + 1); std::deque readers; while (static_cast(readers.size()) < FLAGS_readers) { - readers.emplace_back(&shard_states, db_raw); + readers.emplace_back(&shard_states, db.get()); } std::deque writers; while (static_cast(writers.size()) < FLAGS_writers) { - writers.emplace_back(&shard_states, db_raw); + writers.emplace_back(&shard_states, db.get()); } // Each shard gets a random reader and random writer assigned to it @@ -367,7 +366,7 @@ int main(int argc, char** argv) { shard_states[i].writer = &writers[writer_dist(rng)]; } - StatsThread stats_thread(db_raw); + StatsThread stats_thread(db.get()); for (Writer& w : writers) { w.start(); } diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index 44a1c5d099a9..3033f1cf41e2 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -201,15 +201,8 @@ Status ImportColumnFamilyJob::Run() { const auto& f = files_to_import_[i][j]; const auto& file_metadata = *metadatas_[i][j]; - uint64_t tail_size = 0; - bool contain_no_data_blocks = f.table_properties.num_entries > 0 && - (f.table_properties.num_entries == - f.table_properties.num_range_deletions); - if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) { - uint64_t file_size = f.fd.GetFileSize(); - assert(f.table_properties.tail_start_offset <= file_size); - tail_size = file_size - f.table_properties.tail_start_offset; - } + uint64_t tail_size = FileMetaData::CalculateTailSize(f.fd.GetFileSize(), + f.table_properties); VersionEdit dummy_version_edit; dummy_version_edit.AddFile( @@ -317,8 +310,10 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( std::unique_ptr sst_file; std::unique_ptr sst_file_reader; - status = - fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr); + FileOptions fo{env_options_}; + fo.file_checksum = file_meta.file_checksum; + fo.file_checksum_func_name = file_meta.file_checksum_func_name; + status = fs_->NewRandomAccessFile(external_file, fo, &sst_file, nullptr); if (!status.ok()) { return status; } @@ -331,7 +326,8 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( status = sv->mutable_cf_options.table_factory->NewTableReader( TableReaderOptions( cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, - env_options_, cfd_->internal_comparator(), + sv->mutable_cf_options.compression_manager.get(), env_options_, + cfd_->internal_comparator(), sv->mutable_cf_options.block_protection_bytes_per_key, /*skip_filters*/ false, /*immortal*/ false, /*force_direct_prefetch*/ false, /*level*/ -1, diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc index 5a0139017754..0a6f9d6a3905 100644 --- a/db/import_column_family_test.cc +++ b/db/import_column_family_test.cc @@ -371,7 +371,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { ASSERT_OK(Flush(1)); Checkpoint* checkpoint; - ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint)); ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, &metadata_ptr_)); ASSERT_NE(metadata_ptr_, nullptr); @@ -481,14 +481,14 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) { ASSERT_OK(Flush(1)); Checkpoint* checkpoint; - ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint)); ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, &metadata_ptr_)); ASSERT_NE(metadata_ptr_, nullptr); delete checkpoint; // Create a new db and import the files. - DB* db_copy; + std::unique_ptr db_copy; ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy)); ColumnFamilyHandle* cfh = nullptr; @@ -504,7 +504,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) { } ASSERT_OK(db_copy->DropColumnFamily(cfh)); ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh)); - delete db_copy; + db_copy.reset(); ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); } @@ -529,7 +529,7 @@ TEST_F(ImportColumnFamilyTest, ASSERT_OK(db_->DeleteRange(WriteOptions(), handles_[1], Key(0), Key(2))); Checkpoint* checkpoint; - ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint)); ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, &metadata_ptr_)); ASSERT_NE(metadata_ptr_, nullptr); @@ -605,14 +605,14 @@ TEST_F(ImportColumnFamilyTest, LevelFilesOverlappingAtEndpoints) { ASSERT_GT(NumTableFilesAtLevel(1, 1), 1); Checkpoint* checkpoint; - ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint)); ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, &metadata_ptr_)); ASSERT_NE(metadata_ptr_, nullptr); delete checkpoint; // Create a new db and import the files. - DB* db_copy; + std::unique_ptr db_copy; ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy)); ColumnFamilyHandle* cfh = nullptr; @@ -627,7 +627,7 @@ TEST_F(ImportColumnFamilyTest, LevelFilesOverlappingAtEndpoints) { } ASSERT_OK(db_copy->DropColumnFamily(cfh)); ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh)); - delete db_copy; + db_copy.reset(); ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); for (const Snapshot* snapshot : snapshots) { db_->ReleaseSnapshot(snapshot); @@ -771,12 +771,12 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyTest) { Checkpoint* checkpoint1; Checkpoint* checkpoint2; - ASSERT_OK(Checkpoint::Create(db_, &checkpoint1)); + ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint1)); ASSERT_OK(checkpoint1->ExportColumnFamily(handles_[1], export_files_dir_, &metadata_ptr_)); // Create a new db and import the files. - DB* db_copy; + std::unique_ptr db_copy; ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy)); ColumnFamilyHandle* copy_cfh = nullptr; @@ -796,7 +796,7 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyTest) { ASSERT_OK(db_copy->Flush(FlushOptions())); // Flush again to create another L0 file. It should have higher sequencer. - ASSERT_OK(Checkpoint::Create(db_copy, &checkpoint2)); + ASSERT_OK(Checkpoint::Create(db_copy.get(), &checkpoint2)); ASSERT_OK(checkpoint2->ExportColumnFamily(copy_cfh, export_files_dir2_, &metadata_ptr2_)); @@ -826,7 +826,7 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyTest) { ASSERT_OK(db_copy->DropColumnFamily(copy_cfh)); ASSERT_OK(db_copy->DestroyColumnFamilyHandle(copy_cfh)); - delete db_copy; + db_copy.reset(); ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); } @@ -840,12 +840,12 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyWithOverlap) { Checkpoint* checkpoint1; Checkpoint* checkpoint2; - ASSERT_OK(Checkpoint::Create(db_, &checkpoint1)); + ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint1)); ASSERT_OK(checkpoint1->ExportColumnFamily(handles_[1], export_files_dir_, &metadata_ptr_)); // Create a new db and import the files. - DB* db_copy; + std::unique_ptr db_copy; ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy)); ColumnFamilyHandle* copy_cfh = nullptr; @@ -857,7 +857,7 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyWithOverlap) { ASSERT_OK(db_copy->Flush(FlushOptions())); // Flush again to create another L0 file. It should have higher sequencer. - ASSERT_OK(Checkpoint::Create(db_copy, &checkpoint2)); + ASSERT_OK(Checkpoint::Create(db_copy.get(), &checkpoint2)); ASSERT_OK(checkpoint2->ExportColumnFamily(copy_cfh, export_files_dir2_, &metadata_ptr2_)); @@ -877,7 +877,7 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyWithOverlap) { ASSERT_OK(db_copy->DropColumnFamily(copy_cfh)); ASSERT_OK(db_copy->DestroyColumnFamilyHandle(copy_cfh)); - delete db_copy; + db_copy.reset(); ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); } @@ -1017,7 +1017,7 @@ TEST_F(ImportColumnFamilyTest, AssignEpochNumberToMultipleCF) { // corruption where two L0 files can have the same epoch number but // with overlapping key range. Checkpoint* checkpoint1; - ASSERT_OK(Checkpoint::Create(db_, &checkpoint1)); + ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint1)); ASSERT_OK(checkpoint1->ExportColumnFamily(handles_[1], export_files_dir_, &metadata_ptr_)); ASSERT_OK(checkpoint1->ExportColumnFamily(handles_[2], export_files_dir2_, diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 8e8e6d27ef10..6b2d75385ba4 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -45,6 +45,8 @@ const std::map InternalStats::compaction_level_stats = {LevelStatType::RN_GB, LevelStat{"RnGB", "Rn(GB)"}}, {LevelStatType::RNP1_GB, LevelStat{"Rnp1GB", "Rnp1(GB)"}}, {LevelStatType::WRITE_GB, LevelStat{"WriteGB", "Write(GB)"}}, + {LevelStatType::WRITE_PRE_COMP_GB, + LevelStat{"WPreCompGB", "WPreComp(GB)"}}, {LevelStatType::W_NEW_GB, LevelStat{"WnewGB", "Wnew(GB)"}}, {LevelStatType::MOVED_GB, LevelStat{"MovedGB", "Moved(GB)"}}, {LevelStatType::WRITE_AMP, LevelStat{"WriteAmp", "W-Amp"}}, @@ -100,19 +102,20 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name, int line_size = snprintf( buf + written_size, len - written_size, "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s " + "%s " "%s\n", // Note that we skip COMPACTED_FILES and merge it with Files column group_by.c_str(), hdr(LevelStatType::NUM_FILES), hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE), hdr(LevelStatType::READ_GB), hdr(LevelStatType::RN_GB), hdr(LevelStatType::RNP1_GB), hdr(LevelStatType::WRITE_GB), - hdr(LevelStatType::W_NEW_GB), hdr(LevelStatType::MOVED_GB), - hdr(LevelStatType::WRITE_AMP), hdr(LevelStatType::READ_MBPS), - hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC), - hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT), - hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN), - hdr(LevelStatType::KEY_DROP), hdr(LevelStatType::R_BLOB_GB), - hdr(LevelStatType::W_BLOB_GB)); + hdr(LevelStatType::WRITE_PRE_COMP_GB), hdr(LevelStatType::W_NEW_GB), + hdr(LevelStatType::MOVED_GB), hdr(LevelStatType::WRITE_AMP), + hdr(LevelStatType::READ_MBPS), hdr(LevelStatType::WRITE_MBPS), + hdr(LevelStatType::COMP_SEC), hdr(LevelStatType::COMP_CPU_SEC), + hdr(LevelStatType::COMP_COUNT), hdr(LevelStatType::AVG_SEC), + hdr(LevelStatType::KEY_IN), hdr(LevelStatType::KEY_DROP), + hdr(LevelStatType::R_BLOB_GB), hdr(LevelStatType::W_BLOB_GB)); written_size += line_size; written_size = std::min(written_size, static_cast(len)); @@ -140,6 +143,8 @@ void PrepareLevelStats(std::map* level_stats, stats.bytes_read_non_output_levels / kGB; (*level_stats)[LevelStatType::RNP1_GB] = stats.bytes_read_output_level / kGB; (*level_stats)[LevelStatType::WRITE_GB] = stats.bytes_written / kGB; + (*level_stats)[LevelStatType::WRITE_PRE_COMP_GB] = + stats.bytes_written_pre_comp / kGB; (*level_stats)[LevelStatType::W_NEW_GB] = bytes_new / kGB; (*level_stats)[LevelStatType::MOVED_GB] = stats.bytes_moved / kGB; (*level_stats)[LevelStatType::WRITE_AMP] = w_amp; @@ -164,12 +169,13 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, buf, len, "%4s " /* Level */ "%6d/%-3d " /* Files */ - "%8s " /* Size */ + "%10s " /* Size */ "%5.1f " /* Score */ "%8.1f " /* Read(GB) */ "%7.1f " /* Rn(GB) */ "%8.1f " /* Rnp1(GB) */ "%9.1f " /* Write(GB) */ + "%9.1f " /* WPreComp(GB) */ "%8.1f " /* Wnew(GB) */ "%9.1f " /* Moved(GB) */ "%5.1f " /* W-Amp */ @@ -193,6 +199,7 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, stat_value.at(LevelStatType::RN_GB), stat_value.at(LevelStatType::RNP1_GB), stat_value.at(LevelStatType::WRITE_GB), + stat_value.at(LevelStatType::WRITE_PRE_COMP_GB), stat_value.at(LevelStatType::W_NEW_GB), stat_value.at(LevelStatType::MOVED_GB), stat_value.at(LevelStatType::WRITE_AMP), @@ -303,6 +310,7 @@ static const std::string aggregated_table_properties_at_level = static const std::string num_running_compactions = "num-running-compactions"; static const std::string num_running_compaction_sorted_runs = "num-running-compaction-sorted-runs"; +static const std::string compaction_abort_count = "compaction-abort-count"; static const std::string num_running_flushes = "num-running-flushes"; static const std::string actual_delayed_write_rate = "actual-delayed-write-rate"; @@ -355,6 +363,8 @@ const std::string DB::Properties::kNumRunningCompactions = rocksdb_prefix + num_running_compactions; const std::string DB::Properties::kNumRunningCompactionSortedRuns = rocksdb_prefix + num_running_compaction_sorted_runs; +const std::string DB::Properties::kCompactionAbortCount = + rocksdb_prefix + compaction_abort_count; const std::string DB::Properties::kNumRunningFlushes = rocksdb_prefix + num_running_flushes; const std::string DB::Properties::kBackgroundErrors = @@ -587,6 +597,9 @@ const UnorderedMap {DB::Properties::kNumRunningCompactionSortedRuns, {false, nullptr, &InternalStats::HandleNumRunningCompactionSortedRuns, nullptr, nullptr}}, + {DB::Properties::kCompactionAbortCount, + {false, nullptr, &InternalStats::HandleCompactionAbortCount, nullptr, + nullptr}}, {DB::Properties::kActualDelayedWriteRate, {false, nullptr, &InternalStats::HandleActualDelayedWriteRate, nullptr, nullptr}}, @@ -1285,6 +1298,13 @@ bool InternalStats::HandleNumRunningCompactionSortedRuns(uint64_t* value, return true; } +bool InternalStats::HandleCompactionAbortCount(uint64_t* value, DBImpl* db, + Version* /*version*/) { + *value = static_cast( + db->compaction_aborted_.load(std::memory_order_acquire)); + return true; +} + bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { // Accumulated number of errors in background flushes or compactions. diff --git a/db/internal_stats.h b/db/internal_stats.h index 7ebd406db757..347b3a617aae 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -71,6 +71,7 @@ enum class LevelStatType { RN_GB, RNP1_GB, WRITE_GB, + WRITE_PRE_COMP_GB, W_NEW_GB, MOVED_GB, WRITE_AMP, @@ -153,23 +154,6 @@ class InternalStats { InternalStats(int num_levels, SystemClock* clock, ColumnFamilyData* cfd); - // Per level compaction stats - struct CompactionOutputsStats { - uint64_t num_output_records = 0; - uint64_t bytes_written = 0; - uint64_t bytes_written_blob = 0; - uint64_t num_output_files = 0; - uint64_t num_output_files_blob = 0; - - void Add(const CompactionOutputsStats& stats) { - this->num_output_records += stats.num_output_records; - this->bytes_written += stats.bytes_written; - this->bytes_written_blob += stats.bytes_written_blob; - this->num_output_files += stats.num_output_files; - this->num_output_files_blob += stats.num_output_files_blob; - } - }; - // Per level compaction stats. comp_stats_[level] stores the stats for // compactions that produced data for the specified "level". struct CompactionStats { @@ -196,6 +180,9 @@ class InternalStats { // Total number of bytes written to table files during compaction uint64_t bytes_written; + // Total number of bytes written pre-compression during compaction + uint64_t bytes_written_pre_comp; + // Total number of bytes written to blob files during compaction uint64_t bytes_written_blob; @@ -248,6 +235,7 @@ class InternalStats { bytes_skipped_output_level(0), bytes_read_blob(0), bytes_written(0), + bytes_written_pre_comp(0), bytes_written_blob(0), bytes_moved(0), num_input_files_in_non_output_levels(0), @@ -275,6 +263,7 @@ class InternalStats { bytes_skipped_output_level(0), bytes_read_blob(0), bytes_written(0), + bytes_written_pre_comp(0), bytes_written_blob(0), bytes_moved(0), num_input_files_in_non_output_levels(0), @@ -308,6 +297,7 @@ class InternalStats { bytes_skipped_output_level(c.bytes_skipped_output_level), bytes_read_blob(c.bytes_read_blob), bytes_written(c.bytes_written), + bytes_written_pre_comp(c.bytes_written_pre_comp), bytes_written_blob(c.bytes_written_blob), bytes_moved(c.bytes_moved), num_input_files_in_non_output_levels( @@ -338,6 +328,7 @@ class InternalStats { bytes_skipped_output_level = c.bytes_skipped_output_level; bytes_read_blob = c.bytes_read_blob; bytes_written = c.bytes_written; + bytes_written_pre_comp = c.bytes_written_pre_comp; bytes_written_blob = c.bytes_written_blob; bytes_moved = c.bytes_moved; num_input_files_in_non_output_levels = @@ -370,6 +361,7 @@ class InternalStats { this->bytes_skipped_output_level = 0; this->bytes_read_blob = 0; this->bytes_written = 0; + this->bytes_written_pre_comp = 0; this->bytes_written_blob = 0; this->bytes_moved = 0; this->num_input_files_in_non_output_levels = 0; @@ -398,6 +390,7 @@ class InternalStats { this->bytes_skipped_output_level += c.bytes_skipped_output_level; this->bytes_read_blob += c.bytes_read_blob; this->bytes_written += c.bytes_written; + this->bytes_written_pre_comp += c.bytes_written_pre_comp; this->bytes_written_blob += c.bytes_written_blob; this->bytes_moved += c.bytes_moved; this->num_input_files_in_non_output_levels += @@ -420,15 +413,6 @@ class InternalStats { } } - void Add(const CompactionOutputsStats& stats) { - this->num_output_files += static_cast(stats.num_output_files); - this->num_output_records += stats.num_output_records; - this->bytes_written += stats.bytes_written; - this->bytes_written_blob += stats.bytes_written_blob; - this->num_output_files_blob += - static_cast(stats.num_output_files_blob); - } - void Subtract(const CompactionStats& c) { this->micros -= c.micros; this->cpu_micros -= c.cpu_micros; @@ -439,6 +423,7 @@ class InternalStats { this->bytes_skipped_output_level -= c.bytes_skipped_output_level; this->bytes_read_blob -= c.bytes_read_blob; this->bytes_written -= c.bytes_written; + this->bytes_written_pre_comp -= c.bytes_written_pre_comp; this->bytes_written_blob -= c.bytes_written_blob; this->bytes_moved -= c.bytes_moved; this->num_input_files_in_non_output_levels -= @@ -473,49 +458,51 @@ class InternalStats { } }; - // Compaction stats, for per_key_placement compaction, it includes 2 levels - // stats: the last level and the penultimate level. + // Compaction internal stats, for per_key_placement compaction, it includes 2 + // output level stats: the last level and the proximal level. struct CompactionStatsFull { // the stats for the target primary output level - CompactionStats stats; + CompactionStats output_level_stats; - // stats for penultimate level output if exist - bool has_penultimate_level_output = false; - CompactionStats penultimate_level_stats; + // stats for proximal level output if exist + bool has_proximal_level_output = false; + CompactionStats proximal_level_stats; - explicit CompactionStatsFull() : stats(), penultimate_level_stats() {} + explicit CompactionStatsFull() + : output_level_stats(), proximal_level_stats() {} explicit CompactionStatsFull(CompactionReason reason, int c) - : stats(reason, c), penultimate_level_stats(reason, c) {} + : output_level_stats(reason, c), proximal_level_stats(reason, c) {} uint64_t TotalBytesWritten() const { - uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob; - if (has_penultimate_level_output) { - bytes_written += penultimate_level_stats.bytes_written + - penultimate_level_stats.bytes_written_blob; + uint64_t bytes_written = output_level_stats.bytes_written + + output_level_stats.bytes_written_blob; + if (has_proximal_level_output) { + bytes_written += proximal_level_stats.bytes_written + + proximal_level_stats.bytes_written_blob; } return bytes_written; } uint64_t DroppedRecords() { - uint64_t output_records = stats.num_output_records; - if (has_penultimate_level_output) { - output_records += penultimate_level_stats.num_output_records; + uint64_t output_records = output_level_stats.num_output_records; + if (has_proximal_level_output) { + output_records += proximal_level_stats.num_output_records; } - if (stats.num_input_records > output_records) { - return stats.num_input_records - output_records; + if (output_level_stats.num_input_records > output_records) { + return output_level_stats.num_input_records - output_records; } return 0; } void SetMicros(uint64_t val) { - stats.micros = val; - penultimate_level_stats.micros = val; + output_level_stats.micros = val; + proximal_level_stats.micros = val; } void AddCpuMicros(uint64_t val) { - stats.cpu_micros += val; - penultimate_level_stats.cpu_micros += val; + output_level_stats.cpu_micros += val; + proximal_level_stats.cpu_micros += val; } }; @@ -587,10 +574,9 @@ class InternalStats { void AddCompactionStats(int level, Env::Priority thread_pri, const CompactionStatsFull& comp_stats_full) { - AddCompactionStats(level, thread_pri, comp_stats_full.stats); - if (comp_stats_full.has_penultimate_level_output) { - per_key_placement_comp_stats_.Add( - comp_stats_full.penultimate_level_stats); + AddCompactionStats(level, thread_pri, comp_stats_full.output_level_stats); + if (comp_stats_full.has_proximal_level_output) { + per_key_placement_comp_stats_.Add(comp_stats_full.proximal_level_stats); } } @@ -722,7 +708,10 @@ class InternalStats { // a full cache, which would force a re-scan on the next GetStats. std::shared_ptr> cache_entry_stats_collector_; - // Per-ColumnFamily/level compaction stats + + // Per-column family and level compaction statistics, including flush and file + // ingestion. These are treated as compactions to L0 or the level where the + // file was ingested. std::vector comp_stats_; std::vector comp_stats_by_pri_; CompactionStats per_key_placement_comp_stats_; @@ -863,6 +852,8 @@ class InternalStats { Version* version); bool HandleNumRunningCompactionSortedRuns(uint64_t* value, DBImpl* db, Version* version); + bool HandleCompactionAbortCount(uint64_t* value, DBImpl* db, + Version* version); bool HandleBackgroundErrors(uint64_t* value, DBImpl* db, Version* version); bool HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db, Version* version); diff --git a/db/job_context.h b/db/job_context.h index 83e9f5facafd..365a820d5f48 100644 --- a/db/job_context.h +++ b/db/job_context.h @@ -22,6 +22,9 @@ namespace ROCKSDB_NAMESPACE { class MemTable; struct SuperVersion; +// The purpose of this struct is to simplify pushing work such as +// allocation/construction, de-allocation/destruction, and notifications to +// outside of holding the DB mutex. struct SuperVersionContext { struct WriteStallNotification { WriteStallInfo write_stall_info; @@ -35,12 +38,6 @@ struct SuperVersionContext { std::unique_ptr new_superversion; // if nullptr no new superversion - // If not nullptr, a new seqno to time mapping is available to be installed. - // Otherwise, make a shared copy of the one in the existing SuperVersion and - // carry it over to the new SuperVersion. This is moved to the SuperVersion - // during installation. - std::shared_ptr new_seqno_to_time_mapping{nullptr}; - explicit SuperVersionContext(bool create_superversion = false) : new_superversion(create_superversion ? new SuperVersion() : nullptr) {} @@ -126,7 +123,7 @@ struct JobContext { break; } } - return memtables_to_free.size() > 0 || logs_to_free.size() > 0 || + return memtables_to_free.size() > 0 || wals_to_free.size() > 0 || job_snapshot != nullptr || sv_have_sth; } @@ -138,6 +135,37 @@ struct JobContext { return kMaxSequenceNumber; } + SequenceNumber GetLatestSnapshotSequence() const { + assert(snapshot_context_initialized); + if (snapshot_seqs.empty()) { + return 0; + } + return snapshot_seqs.back(); + } + + SequenceNumber GetEarliestSnapshotSequence() const { + assert(snapshot_context_initialized); + if (snapshot_seqs.empty()) { + return kMaxSequenceNumber; + } + return snapshot_seqs.front(); + } + + void InitSnapshotContext(SnapshotChecker* checker, + std::unique_ptr managed_snapshot, + SequenceNumber earliest_write_conflict, + std::vector&& snapshots) { + if (snapshot_context_initialized) { + return; + } + snapshot_context_initialized = true; + snapshot_checker = checker; + assert(!job_snapshot); + job_snapshot = std::move(managed_snapshot); + earliest_write_conflict_snapshot = earliest_write_conflict; + snapshot_seqs = std::move(snapshots); + } + // Structure to store information for candidate files to delete. struct CandidateFileInfo { std::string file_name; @@ -149,9 +177,6 @@ struct JobContext { } }; - // Unique job id - int job_id; - // a list of all files that we'll consider deleting // (every once in a while this is filled up with all files // in the DB directory) @@ -196,37 +221,47 @@ struct JobContext { // contexts for installing superversions for multiple column families std::vector superversion_contexts; - autovector logs_to_free; + autovector wals_to_free; // the current manifest_file_number, log_number and prev_log_number // that corresponds to the set of files in 'live'. - uint64_t manifest_file_number; - uint64_t pending_manifest_file_number; + uint64_t manifest_file_number = 0; + uint64_t pending_manifest_file_number = 0; // Used for remote compaction. To prevent OPTIONS files from getting // purged by PurgeObsoleteFiles() of the primary host uint64_t min_options_file_number; - uint64_t log_number; - uint64_t prev_log_number; + uint64_t log_number = 0; + uint64_t prev_log_number = 0; uint64_t min_pending_output = 0; - uint64_t prev_total_log_size = 0; - size_t num_alive_log_files = 0; + uint64_t prev_wals_total_size = 0; + size_t num_alive_wal_files = 0; uint64_t size_log_to_delete = 0; // Snapshot taken before flush/compaction job. std::unique_ptr job_snapshot; + SnapshotChecker* snapshot_checker = nullptr; + std::vector snapshot_seqs; + // This is the earliest snapshot that could be used for write-conflict + // checking by a transaction. For any user-key newer than this snapshot, we + // should make sure not to remove evidence that a write occurred. + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber; + + // Unique job id + int job_id; + + bool snapshot_context_initialized = false; explicit JobContext(int _job_id, bool create_superversion = false) { job_id = _job_id; - manifest_file_number = 0; - pending_manifest_file_number = 0; - log_number = 0; - prev_log_number = 0; superversion_contexts.emplace_back( SuperVersionContext(create_superversion)); } + // Delete the default constructor + JobContext() = delete; + // For non-empty JobContext Clean() has to be called at least once before // before destruction (see asserts in ~JobContext()). Should be called with // unlocked DB mutex. Destructor doesn't call Clean() to avoid accidentally @@ -240,18 +275,18 @@ struct JobContext { for (auto m : memtables_to_free) { delete m; } - for (auto l : logs_to_free) { + for (auto l : wals_to_free) { delete l; } memtables_to_free.clear(); - logs_to_free.clear(); + wals_to_free.clear(); job_snapshot.reset(); } ~JobContext() { assert(memtables_to_free.size() == 0); - assert(logs_to_free.size() == 0); + assert(wals_to_free.size() == 0); } }; diff --git a/db/listener_test.cc b/db/listener_test.cc index bfd5953668ff..989de3583c7b 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -105,7 +105,7 @@ class TestCompactionListener : public EventListener { ASSERT_EQ(ci.output_files.size(), ci.output_file_infos.size()); ASSERT_TRUE(test_); - ASSERT_EQ(test_->db_, db); + ASSERT_EQ(test_->db_.get(), db); std::vector> files_by_level; test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[ci.cf_id], @@ -163,9 +163,7 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) { options.max_bytes_for_level_base = options.target_file_size_base * 2; options.max_bytes_for_level_multiplier = 2; options.compression = kNoCompression; -#ifdef ROCKSDB_USING_THREAD_STATUS - options.enable_thread_tracking = true; -#endif // ROCKSDB_USING_THREAD_STATUS + options.enable_thread_tracking = ThreadStatus::kEnabled; options.level0_file_num_compaction_trigger = kNumL0Files; options.table_properties_collector_factories.push_back( std::make_shared()); @@ -199,7 +197,7 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) { ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size()); for (size_t i = 0; i < cf_names.size(); ++i) { - ASSERT_EQ(listener->compacted_dbs_[i], db_); + ASSERT_EQ(listener->compacted_dbs_[i], db_.get()); } } @@ -229,7 +227,7 @@ class TestFlushListener : public EventListener { ASSERT_EQ(info.file_checksum, kUnknownFileChecksum); ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName); -#ifdef ROCKSDB_USING_THREAD_STATUS +#ifndef NROCKSDB_THREAD_STATUS // Verify the id of the current thread that created this table // file matches the id of any active flush or compaction thread. uint64_t thread_id = env_->GetThreadID(); @@ -246,7 +244,7 @@ class TestFlushListener : public EventListener { } } ASSERT_TRUE(found_match); -#endif // ROCKSDB_USING_THREAD_STATUS +#endif // !NROCKSDB_THREAD_STATUS } void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { @@ -270,7 +268,7 @@ class TestFlushListener : public EventListener { // that assumption does not hold (see the test case MultiDBMultiListeners // below). ASSERT_TRUE(test_); - if (db == test_->db_) { + if (db == test_->db_.get()) { std::vector> files_by_level; ASSERT_LT(info.cf_id, test_->handles_.size()); ASSERT_GE(info.cf_id, 0u); @@ -310,9 +308,7 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) { Options options; options.env = CurrentOptions().env; options.write_buffer_size = k110KB; -#ifdef ROCKSDB_USING_THREAD_STATUS - options.enable_thread_tracking = true; -#endif // ROCKSDB_USING_THREAD_STATUS + options.enable_thread_tracking = ThreadStatus::kEnabled; TestFlushListener* listener = new TestFlushListener(options.env, this); options.listeners.emplace_back(listener); std::vector cf_names = {"pikachu", "ilya", "muromec", @@ -347,7 +343,7 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) { // make sure callback functions are called in the right order for (size_t i = 0; i < cf_names.size(); ++i) { - ASSERT_EQ(listener->flushed_dbs_[i], db_); + ASSERT_EQ(listener->flushed_dbs_[i], db_.get()); ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]); } } @@ -357,9 +353,7 @@ TEST_F(EventListenerTest, MultiCF) { Options options; options.env = CurrentOptions().env; options.write_buffer_size = k110KB; -#ifdef ROCKSDB_USING_THREAD_STATUS - options.enable_thread_tracking = true; -#endif // ROCKSDB_USING_THREAD_STATUS + options.enable_thread_tracking = ThreadStatus::kEnabled; options.atomic_flush = atomic_flush; options.create_if_missing = true; DestroyAndReopen(options); @@ -393,7 +387,7 @@ TEST_F(EventListenerTest, MultiCF) { // make sure callback functions are called in the right order if (i == 7) { for (size_t j = 0; j < cf_names.size(); j++) { - ASSERT_EQ(listener->flushed_dbs_[j], db_); + ASSERT_EQ(listener->flushed_dbs_[j], db_.get()); ASSERT_EQ(listener->flushed_column_family_names_[j], cf_names[j]); } } @@ -407,9 +401,7 @@ TEST_F(EventListenerTest, MultiCF) { TEST_F(EventListenerTest, MultiDBMultiListeners) { Options options; options.env = CurrentOptions().env; -#ifdef ROCKSDB_USING_THREAD_STATUS - options.enable_thread_tracking = true; -#endif // ROCKSDB_USING_THREAD_STATUS + options.enable_thread_tracking = ThreadStatus::kEnabled; options.table_properties_collector_factories.push_back( std::make_shared()); std::vector listeners; @@ -430,22 +422,21 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) { DBOptions db_opts(options); ColumnFamilyOptions cf_opts(options); - std::vector dbs; + std::vector> dbs; std::vector> vec_handles; for (int d = 0; d < kNumDBs; ++d) { ASSERT_OK(DestroyDB(dbname_ + std::to_string(d), options)); - DB* db; + ASSERT_OK( + DB::Open(options, dbname_ + std::to_string(d), &dbs.emplace_back())); std::vector handles; - ASSERT_OK(DB::Open(options, dbname_ + std::to_string(d), &db)); for (size_t c = 0; c < cf_names.size(); ++c) { ColumnFamilyHandle* handle; - ASSERT_OK(db->CreateColumnFamily(cf_opts, cf_names[c], &handle)); + ASSERT_OK(dbs.back()->CreateColumnFamily(cf_opts, cf_names[c], &handle)); handles.push_back(handle); } vec_handles.push_back(std::move(handles)); - dbs.push_back(db); } for (int d = 0; d < kNumDBs; ++d) { @@ -458,23 +449,23 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) { for (size_t c = 0; c < cf_names.size(); ++c) { for (int d = 0; d < kNumDBs; ++d) { ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c])); - ASSERT_OK( - static_cast_with_check(dbs[d])->TEST_WaitForFlushMemTable()); + ASSERT_OK(static_cast_with_check(dbs[d].get()) + ->TEST_WaitForFlushMemTable()); } } for (int d = 0; d < kNumDBs; ++d) { // Ensure background work is fully finished including listener callbacks // before accessing listener state. - ASSERT_OK( - static_cast_with_check(dbs[d])->TEST_WaitForBackgroundWork()); + ASSERT_OK(static_cast_with_check(dbs[d].get()) + ->TEST_WaitForBackgroundWork()); } for (auto* listener : listeners) { int pos = 0; for (size_t c = 0; c < cf_names.size(); ++c) { for (int d = 0; d < kNumDBs; ++d) { - ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d]); + ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d].get()); ASSERT_EQ(listener->flushed_column_family_names_[pos], cf_names[c]); pos++; } @@ -489,17 +480,15 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) { } vec_handles.clear(); - for (auto db : dbs) { - delete db; + for (auto& db : dbs) { + db.reset(); } } TEST_F(EventListenerTest, DisableBGCompaction) { Options options; options.env = CurrentOptions().env; -#ifdef ROCKSDB_USING_THREAD_STATUS - options.enable_thread_tracking = true; -#endif // ROCKSDB_USING_THREAD_STATUS + options.enable_thread_tracking = ThreadStatus::kEnabled; TestFlushListener* listener = new TestFlushListener(options.env, this); const int kCompactionTrigger = 1; const int kSlowdownTrigger = 5; @@ -537,6 +526,47 @@ TEST_F(EventListenerTest, DisableBGCompaction) { ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9); } +class TestNumInputFilesTotalInputBytesPouplatedInListener + : public EventListener { + public: + void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override { + std::lock_guard lock(mutex_); + num_input_files = ci.stats.num_input_files; + total_num_of_bytes = ci.stats.total_input_bytes; + } + size_t num_input_files = 0; + size_t total_num_of_bytes = 0; + std::mutex mutex_; +}; + +TEST_F(EventListenerTest, NumInputFilesTotalBytesPopulated) { + Options options; + options.level_compaction_dynamic_level_bytes = false; + options.env = CurrentOptions().env; + options.create_if_missing = true; + options.memtable_factory.reset(test::NewSpecialSkipListFactory( + DBTestBase::kNumKeysByGenerateNewRandomFile)); + + TestNumInputFilesTotalInputBytesPouplatedInListener* listener = + new TestNumInputFilesTotalInputBytesPouplatedInListener(); + options.listeners.emplace_back(listener); + + options.level0_file_num_compaction_trigger = 4; + options.compaction_style = kCompactionStyleLevel; + + DestroyAndReopen(options); + Random rnd(301); + ASSERT_EQ(listener->num_input_files, 0); + ASSERT_EQ(listener->total_num_of_bytes, 0); + // Write 4 files in L0 + for (int i = 0; i < 4; i++) { + GenerateNewRandomFile(&rnd); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(listener->num_input_files, 4); + ASSERT_NE(listener->total_num_of_bytes, 0); +} + class TestCompactionReasonListener : public EventListener { public: void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override { @@ -1278,16 +1308,21 @@ class BlobDBJobLevelEventListenerTest : public EventListener { explicit BlobDBJobLevelEventListenerTest(EventListenerTest* test) : test_(test), call_count_(0) {} - const VersionStorageInfo* GetVersionStorageInfo() const { - VersionSet* const versions = test_->dbfull()->GetVersionSet(); + // NOTE: it's not safe to rely on test_->db_ for these functions because + // the DB may be in the process of closing when these are called, and the + // unique_ptr is set to nullptr before invoking ~DB() + + const VersionStorageInfo* GetVersionStorageInfo(DB* db) const { + DBImpl* db_impl = static_cast_with_check(db); + VersionSet* const versions = db_impl->GetVersionSet(); assert(versions); ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); EXPECT_NE(cfd, nullptr); - test_->dbfull()->TEST_LockMutex(); + db_impl->TEST_LockMutex(); Version* const current = cfd->current(); - test_->dbfull()->TEST_UnlockMutex(); + db_impl->TEST_UnlockMutex(); EXPECT_NE(current, nullptr); const VersionStorageInfo* const storage_info = current->storage_info(); @@ -1297,8 +1332,9 @@ class BlobDBJobLevelEventListenerTest : public EventListener { } void CheckBlobFileAdditions( + DB* db, const std::vector& blob_file_addition_infos) const { - const auto* vstorage = GetVersionStorageInfo(); + const auto* vstorage = GetVersionStorageInfo(db); EXPECT_FALSE(blob_file_addition_infos.empty()); @@ -1326,7 +1362,7 @@ class BlobDBJobLevelEventListenerTest : public EventListener { return result; } - void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { + void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { { std::lock_guard lock(mutex_); IncreaseCallCount(/*mutex_locked*/ true); @@ -1335,16 +1371,15 @@ class BlobDBJobLevelEventListenerTest : public EventListener { EXPECT_EQ(info.blob_compression_type, kNoCompression); - CheckBlobFileAdditions(info.blob_file_addition_infos); + CheckBlobFileAdditions(db, info.blob_file_addition_infos); } - void OnCompactionCompleted(DB* /*db*/, - const CompactionJobInfo& info) override { + void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override { IncreaseCallCount(/*mutex_locked*/ false); EXPECT_EQ(info.blob_compression_type, kNoCompression); - CheckBlobFileAdditions(info.blob_file_addition_infos); + CheckBlobFileAdditions(db, info.blob_file_addition_infos); EXPECT_FALSE(info.blob_file_garbage_infos.empty()); diff --git a/db/log_reader.cc b/db/log_reader.cc index 0f0e25033ab5..2650b4c97a9a 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -95,7 +95,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, uint64_t prospective_record_offset = 0; Slice fragment; - while (true) { + for (;;) { uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); size_t drop_size = 0; const uint8_t record_type = @@ -140,7 +140,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, prospective_record_offset = physical_record_offset; scratch->assign(fragment.data(), fragment.size()); in_fragmented_record = true; - break; + break; // switch case kMiddleType: case kRecyclableMiddleType: @@ -153,7 +153,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, } scratch->append(fragment.data(), fragment.size()); } - break; + break; // switch case kLastType: case kRecyclableLastType: @@ -171,7 +171,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, first_record_read_ = true; return true; } - break; + break; // switch case kSetCompressionType: { if (compression_type_record_read_) { @@ -193,7 +193,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, } else { InitCompression(compression_record); } - break; + break; // switch } case kPredecessorWALInfoType: case kRecyclePredecessorWALInfoType: { @@ -210,7 +210,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, MaybeVerifyPredecessorWALInfo(wal_recovery_mode, fragment, recorded_predecessor_wal_info); } - break; + break; // switch } case kUserDefinedTimestampSizeType: case kRecyclableUserDefinedTimestampSizeType: { @@ -235,7 +235,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, ReportCorruption(fragment.size(), s.getState()); } } - break; + break; // switch } case kBadHeader: @@ -304,7 +304,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, in_fragmented_record = false; scratch->clear(); } - break; + break; // switch case kBadRecordLen: if (eof_) { @@ -337,7 +337,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, in_fragmented_record = false; scratch->clear(); } - break; + break; // switch default: { if ((record_type & kRecordTypeSafeIgnoreMask) == 0) { @@ -349,11 +349,11 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, } in_fragmented_record = false; scratch->clear(); - break; + break; // switch } } } - return false; + // unreachable } void Reader::MaybeVerifyPredecessorWALInfo( @@ -380,8 +380,11 @@ void Reader::MaybeVerifyPredecessorWALInfo( } else { if (observed_predecessor_wal_info_.GetLogNumber() != recorded_predecessor_log_number) { - std::string reason = "Missing WAL of log number " + - std::to_string(recorded_predecessor_log_number); + std::string reason = + "Mismatched predecessor log number of WAL file " + + file_->file_name() + " Recorded " + + std::to_string(recorded_predecessor_log_number) + ". Observed " + + std::to_string(observed_predecessor_wal_info_.GetLogNumber()); ReportCorruption(fragment.size(), reason.c_str(), recorded_predecessor_log_number); } else if (observed_predecessor_wal_info_.GetLastSeqnoRecorded() != diff --git a/db/log_reader.h b/db/log_reader.h index dfcd6b7690f3..b2c43f076414 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -59,7 +59,7 @@ class Reader { // live while this Reader is in use. // // If "checksum" is true, verify checksums if available. - // TODO(hx235): seperate WAL related parameters from general `Reader` + // TODO(hx235): separate WAL related parameters from general `Reader` // parameters Reader(std::shared_ptr info_log, std::unique_ptr&& file, Reporter* reporter, @@ -155,7 +155,7 @@ class Reader { // which log number this is uint64_t const log_number_; - // See `Optinos::track_and_verify_wals` + // See `Options::track_and_verify_wals` bool track_and_verify_wals_; // Below variables are used for WAL verification // TODO(hx235): To revise `stop_replay_for_corruption_` inside `LogReader` @@ -208,8 +208,8 @@ class Reader { }; // Return type, or one of the preceding special values - // If WAL compressioned is enabled, fragment_checksum is the checksum of the - // fragment computed from the orginal buffer containinng uncompressed + // If WAL compression is enabled, fragment_checksum is the checksum of the + // fragment computed from the original buffer containing uncompressed // fragment. uint8_t ReadPhysicalRecord(Slice* result, size_t* drop_size, uint64_t* fragment_checksum = nullptr); diff --git a/db/log_writer.h b/db/log_writer.h index f7aef75197d5..3a76faab771b 100644 --- a/db/log_writer.h +++ b/db/log_writer.h @@ -77,7 +77,7 @@ class Writer { // Create a writer that will append data to "*dest". // "*dest" must be initially empty. // "*dest" must remain live while this Writer is in use. - // TODO(hx235): seperate WAL related parameters from general `Reader` + // TODO(hx235): separate WAL related parameters from general `Reader` // parameters explicit Writer(std::unique_ptr&& dest, uint64_t log_number, bool recycle_log_files, diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index e84031065426..d740c7d2d630 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -98,7 +98,7 @@ class LogCompactionFilter : public CompactionFilter { TEST_F(ManualCompactionTest, CompactTouchesAllKeys) { for (int iter = 0; iter < 2; ++iter) { - DB* db; + std::unique_ptr db; Options options; if (iter == 0) { // level compaction options.num_levels = 3; @@ -128,7 +128,7 @@ TEST_F(ManualCompactionTest, CompactTouchesAllKeys) { delete itr; delete options.compaction_filter; - delete db; + db.reset(); ASSERT_OK(DestroyDB(dbname_, options)); } } @@ -137,7 +137,7 @@ TEST_F(ManualCompactionTest, Test) { // Open database. Disable compression since it affects the creation // of layers and the code below is trying to test against a very // specific scenario. - DB* db; + std::unique_ptr db; Options db_options; db_options.write_buffer_size = 1024; db_options.create_if_missing = true; @@ -185,12 +185,12 @@ TEST_F(ManualCompactionTest, Test) { ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys"; // close database - delete db; + db.reset(); ASSERT_OK(DestroyDB(dbname_, Options())); } TEST_F(ManualCompactionTest, SkipLevel) { - DB* db; + std::unique_ptr db; Options options; options.level_compaction_dynamic_level_bytes = false; options.num_levels = 3; @@ -298,7 +298,7 @@ TEST_F(ManualCompactionTest, SkipLevel) { } delete filter; - delete db; + db.reset(); ASSERT_OK(DestroyDB(dbname_, options)); } diff --git a/db/memtable.cc b/db/memtable.cc index 5f5450276b38..7a2b0fe6880a 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -70,7 +70,9 @@ ImmutableMemTableOptions::ImmutableMemTableOptions( protection_bytes_per_key( mutable_cf_options.memtable_protection_bytes_per_key), allow_data_in_errors(ioptions.allow_data_in_errors), - paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks) {} + paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks), + memtable_veirfy_per_key_checksum_on_seek( + mutable_cf_options.memtable_veirfy_per_key_checksum_on_seek) {} MemTable::MemTable(const InternalKeyComparator& cmp, const ImmutableOptions& ioptions, @@ -115,7 +117,13 @@ MemTable::MemTable(const InternalKeyComparator& cmp, oldest_key_time_(std::numeric_limits::max()), approximate_memory_usage_(0), memtable_max_range_deletions_( - mutable_cf_options.memtable_max_range_deletions) { + mutable_cf_options.memtable_max_range_deletions), + key_validation_callback_( + (moptions_.protection_bytes_per_key != 0 && + moptions_.memtable_veirfy_per_key_checksum_on_seek) + ? std::bind(&MemTable::ValidateKey, this, std::placeholders::_1, + std::placeholders::_2) + : std::function(nullptr)) { UpdateFlushState(); // something went wrong if we need to flush before inserting anything assert(!ShouldScheduleFlush()); @@ -134,6 +142,16 @@ MemTable::MemTable(const InternalKeyComparator& cmp, auto new_cache = std::make_shared(); size_t size = cached_range_tombstone_.Size(); for (size_t i = 0; i < size; ++i) { +#if defined(__cpp_lib_atomic_shared_ptr) + std::atomic>* + local_cache_ref_ptr = cached_range_tombstone_.AccessAtCore(i); + auto new_local_cache_ref = std::make_shared< + const std::shared_ptr>(new_cache); + std::shared_ptr aliased_ptr( + new_local_cache_ref, new_cache.get()); + local_cache_ref_ptr->store(std::move(aliased_ptr), + std::memory_order_relaxed); +#else std::shared_ptr* local_cache_ref_ptr = cached_range_tombstone_.AccessAtCore(i); auto new_local_cache_ref = std::make_shared< @@ -143,11 +161,11 @@ MemTable::MemTable(const InternalKeyComparator& cmp, std::shared_ptr(new_local_cache_ref, new_cache.get()), std::memory_order_relaxed); +#endif } const Comparator* ucmp = cmp.user_comparator(); assert(ucmp); ts_sz_ = ucmp->timestamp_size(); - persist_user_defined_timestamps_ = ioptions.persist_user_defined_timestamps; } MemTable::~MemTable() { @@ -169,21 +187,26 @@ size_t MemTable::ApproximateMemoryUsage() { } total_usage += usage; } - approximate_memory_usage_.store(total_usage, std::memory_order_relaxed); + approximate_memory_usage_.StoreRelaxed(total_usage); // otherwise, return the actual usage return total_usage; } bool MemTable::ShouldFlushNow() { + if (IsMarkedForFlush()) { + // TODO: dedicated flush reason when marked for flush + return true; + } + // This is set if memtable_max_range_deletions is > 0, // and that many range deletions are done if (memtable_max_range_deletions_ > 0 && - num_range_deletes_.load(std::memory_order_relaxed) >= + num_range_deletes_.LoadRelaxed() >= static_cast(memtable_max_range_deletions_)) { return true; } - size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed); + size_t write_buffer_size = write_buffer_size_.LoadRelaxed(); // In a lot of times, we cannot allocate arena blocks that exactly matches the // buffer size. Thus we have to decide if we should over-allocate or // under-allocate. @@ -192,13 +215,14 @@ bool MemTable::ShouldFlushNow() { // allocate one more block. const double kAllowOverAllocationRatio = 0.6; + // range deletion use skip list which allocates all memeory through `arena_` + assert(range_del_table_->ApproximateMemoryUsage() == 0); // If arena still have room for new block allocation, we can safely say it // shouldn't flush. - auto allocated_memory = table_->ApproximateMemoryUsage() + - range_del_table_->ApproximateMemoryUsage() + - arena_.MemoryAllocatedBytes(); + auto allocated_memory = + table_->ApproximateMemoryUsage() + arena_.MemoryAllocatedBytes(); - approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed); + approximate_memory_usage_.StoreRelaxed(allocated_memory); // if we can still allocate one more block without exceeding the // over-allocation ratio, then we should not flush. @@ -378,7 +402,11 @@ class MemTableIterator : public InternalIterator { !mem.GetImmutableMemTableOptions()->inplace_update_support), arena_mode_(arena != nullptr), paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks), - allow_data_in_error(mem.moptions_.allow_data_in_errors) { + validate_on_seek_( + mem.moptions_.paranoid_memory_checks || + mem.moptions_.memtable_veirfy_per_key_checksum_on_seek), + allow_data_in_error_(mem.moptions_.allow_data_in_errors), + key_validation_callback_(mem.key_validation_callback_) { if (kind == kRangeDelEntries) { iter_ = mem.range_del_table_->GetIterator(arena); } else if (prefix_extractor_ != nullptr && @@ -447,8 +475,10 @@ class MemTableIterator : public InternalIterator { } } } - if (paranoid_memory_checks_) { - status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error); + if (validate_on_seek_) { + status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error_, + paranoid_memory_checks_, + key_validation_callback_); } else { iter_->Seek(k, nullptr); } @@ -472,8 +502,10 @@ class MemTableIterator : public InternalIterator { } } } - if (paranoid_memory_checks_) { - status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error); + if (validate_on_seek_) { + status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error_, + paranoid_memory_checks_, + key_validation_callback_); } else { iter_->Seek(k, nullptr); } @@ -502,7 +534,7 @@ class MemTableIterator : public InternalIterator { PERF_COUNTER_ADD(next_on_memtable_count, 1); assert(Valid()); if (paranoid_memory_checks_) { - status_ = iter_->NextAndValidate(allow_data_in_error); + status_ = iter_->NextAndValidate(allow_data_in_error_); } else { iter_->Next(); TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_); @@ -524,7 +556,7 @@ class MemTableIterator : public InternalIterator { PERF_COUNTER_ADD(prev_on_memtable_count, 1); assert(Valid()); if (paranoid_memory_checks_) { - status_ = iter_->PrevAndValidate(allow_data_in_error); + status_ = iter_->PrevAndValidate(allow_data_in_error_); } else { iter_->Prev(); } @@ -583,7 +615,9 @@ class MemTableIterator : public InternalIterator { bool value_pinned_; bool arena_mode_; const bool paranoid_memory_checks_; - const bool allow_data_in_error; + const bool validate_on_seek_; + const bool allow_data_in_error_; + const std::function key_validation_callback_; void VerifyEntryChecksum() { if (protection_bytes_per_key_ > 0 && Valid()) { @@ -740,7 +774,7 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( const ReadOptions& read_options, SequenceNumber read_seq, bool immutable_memtable) { if (read_options.ignore_range_deletions || - is_range_del_table_empty_.load(std::memory_order_relaxed)) { + is_range_del_table_empty_.LoadRelaxed()) { return nullptr; } return NewRangeTombstoneIteratorInternal(read_options, read_seq, @@ -751,7 +785,7 @@ FragmentedRangeTombstoneIterator* MemTable::NewTimestampStrippingRangeTombstoneIterator( const ReadOptions& read_options, SequenceNumber read_seq, size_t ts_sz) { if (read_options.ignore_range_deletions || - is_range_del_table_empty_.load(std::memory_order_relaxed)) { + is_range_del_table_empty_.LoadRelaxed()) { return nullptr; } if (!timestamp_stripping_fragmented_range_tombstone_list_) { @@ -785,8 +819,13 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal( // takes current cache std::shared_ptr cache = +#if defined(__cpp_lib_atomic_shared_ptr) + cached_range_tombstone_.Access()->load(std::memory_order_relaxed) +#else std::atomic_load_explicit(cached_range_tombstone_.Access(), - std::memory_order_relaxed); + std::memory_order_relaxed) +#endif + ; // construct fragmented tombstone list if necessary if (!cache->initialized.load(std::memory_order_acquire)) { cache->reader_mutex.lock(); @@ -810,7 +849,7 @@ void MemTable::ConstructFragmentedRangeTombstones() { // There should be no concurrent Construction. // We could also check fragmented_range_tombstone_list_ to avoid repeate // constructions. We just construct them here again to be safe. - if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) { + if (!is_range_del_table_empty_.LoadRelaxed()) { // TODO: plumb Env::IOActivity, Env::IOPriority auto* unfragmented_iter = new MemTableIterator( MemTableIterator::kRangeDelEntries, *this, ReadOptions()); @@ -833,7 +872,7 @@ ReadOnlyMemTable::MemTableStats MemTable::ApproximateStats( if (entry_count == 0) { return {0, 0}; } - uint64_t n = num_entries_.load(std::memory_order_relaxed); + uint64_t n = num_entries_.LoadRelaxed(); if (n == 0) { return {0, 0}; } @@ -843,7 +882,7 @@ ReadOnlyMemTable::MemTableStats MemTable::ApproximateStats( // the inaccuracy. entry_count = n; } - uint64_t data_size = data_size_.load(std::memory_order_relaxed); + uint64_t data_size = data_size_.LoadRelaxed(); return {entry_count * (data_size / n), entry_count}; } @@ -973,17 +1012,14 @@ Status MemTable::Add(SequenceNumber s, ValueType type, // this is a bit ugly, but is the way to avoid locked instructions // when incrementing an atomic - num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1, - std::memory_order_relaxed); - data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len, - std::memory_order_relaxed); + num_entries_.StoreRelaxed(num_entries_.LoadRelaxed() + 1); + data_size_.StoreRelaxed(data_size_.LoadRelaxed() + encoded_len); if (type == kTypeDeletion || type == kTypeSingleDeletion || type == kTypeDeletionWithTimestamp) { - num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1, - std::memory_order_relaxed); + num_deletes_.StoreRelaxed(num_deletes_.LoadRelaxed() + 1); } else if (type == kTypeRangeDeletion) { - uint64_t val = num_range_deletes_.load(std::memory_order_relaxed) + 1; - num_range_deletes_.store(val, std::memory_order_relaxed); + uint64_t val = num_range_deletes_.LoadRelaxed() + 1; + num_range_deletes_.StoreRelaxed(val); } if (bloom_filter_ && prefix_extractor_ && @@ -1054,6 +1090,16 @@ Status MemTable::Add(SequenceNumber s, ValueType type, range_del_mutex_.lock(); } for (size_t i = 0; i < size; ++i) { +#if defined(__cpp_lib_atomic_shared_ptr) + std::atomic>* + local_cache_ref_ptr = cached_range_tombstone_.AccessAtCore(i); + auto new_local_cache_ref = std::make_shared< + const std::shared_ptr>(new_cache); + std::shared_ptr aliased_ptr( + new_local_cache_ref, new_cache.get()); + local_cache_ref_ptr->store(std::move(aliased_ptr), + std::memory_order_relaxed); +#else std::shared_ptr* local_cache_ref_ptr = cached_range_tombstone_.AccessAtCore(i); auto new_local_cache_ref = std::make_shared< @@ -1068,12 +1114,13 @@ Status MemTable::Add(SequenceNumber s, ValueType type, std::shared_ptr( new_local_cache_ref, new_cache.get()), std::memory_order_relaxed); +#endif } if (allow_concurrent) { range_del_mutex_.unlock(); } - is_range_del_table_empty_.store(false, std::memory_order_relaxed); + is_range_del_table_empty_.StoreRelaxed(false); } UpdateOldestKeyTime(); @@ -1464,11 +1511,13 @@ void MemTable::GetFromTable(const LookupKey& key, saver.allow_data_in_errors = moptions_.allow_data_in_errors; saver.protection_bytes_per_key = moptions_.protection_bytes_per_key; - if (!moptions_.paranoid_memory_checks) { + if (!moptions_.paranoid_memory_checks && + !moptions_.memtable_veirfy_per_key_checksum_on_seek) { table_->Get(key, &saver, SaveValue); } else { - Status check_s = table_->GetAndValidate(key, &saver, SaveValue, - moptions_.allow_data_in_errors); + Status check_s = table_->GetAndValidate( + key, &saver, SaveValue, moptions_.allow_data_in_errors, + moptions_.paranoid_memory_checks, key_validation_callback_); if (check_s.IsCorruption()) { *(saver.status) = check_s; // Should stop searching the LSM. @@ -1479,6 +1528,11 @@ void MemTable::GetFromTable(const LookupKey& key, *seq = saver.seq; } +Status MemTable::ValidateKey(const char* key, bool allow_data_in_errors) { + return VerifyEntryChecksum(key, moptions_.protection_bytes_per_key, + allow_data_in_errors); +} + void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, ReadCallback* callback, bool immutable_memtable) { // The sequence number is updated synchronously in version_set.h @@ -1492,7 +1546,7 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, // range tombstones. This is the simplest way to ensure range tombstones are // handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0 bool no_range_del = read_options.ignore_range_deletions || - is_range_del_table_empty_.load(std::memory_order_relaxed); + is_range_del_table_empty_.LoadRelaxed(); MultiGetRange temp_range(*range, range->begin(), range->end()); if (bloom_filter_ && no_range_del) { bool whole_key = @@ -1801,7 +1855,7 @@ uint64_t MemTable::GetMinLogContainingPrepSection() { } void MemTable::MaybeUpdateNewestUDT(const Slice& user_key) { - if (ts_sz_ == 0 || persist_user_defined_timestamps_) { + if (ts_sz_ == 0) { return; } const Comparator* ucmp = GetInternalKeyComparator().user_comparator(); @@ -1812,9 +1866,7 @@ void MemTable::MaybeUpdateNewestUDT(const Slice& user_key) { } const Slice& MemTable::GetNewestUDT() const { - // This path should not be invoked for MemTables that does not enable the UDT - // in Memtable only feature. - assert(ts_sz_ > 0 && !persist_user_defined_timestamps_); + assert(ts_sz_ > 0); return newest_udt_; } diff --git a/db/memtable.h b/db/memtable.h index 7032a3af449c..fb3d2323156b 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -8,7 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include #include #include #include @@ -30,6 +29,7 @@ #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" #include "table/multiget_context.h" +#include "util/atomic.h" #include "util/cast_util.h" #include "util/dynamic_bloom.h" #include "util/hash.h" @@ -64,6 +64,7 @@ struct ImmutableMemTableOptions { uint32_t protection_bytes_per_key; bool allow_data_in_errors; bool paranoid_memory_checks; + bool memtable_veirfy_per_key_checksum_on_seek; }; // Batched counters to updated when inserting keys in one write batch. @@ -354,13 +355,13 @@ class ReadOnlyMemTable { // be flushed to storage // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable. - uint64_t GetNextLogNumber() const { return mem_next_logfile_number_; } + uint64_t GetNextLogNumber() const { return mem_next_walfile_number_; } // Sets the next active logfile number when this memtable is about to // be flushed to storage // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable. - void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; } + void SetNextLogNumber(uint64_t num) { mem_next_walfile_number_ = num; } // REQUIRES: db_mutex held. void SetID(uint64_t id) { id_ = id; } @@ -496,6 +497,10 @@ class ReadOnlyMemTable { return false; } + void MarkForFlush() { marked_for_flush_.StoreRelaxed(true); } + + bool IsMarkedForFlush() const { return marked_for_flush_.LoadRelaxed(); } + protected: friend class MemTableList; @@ -511,7 +516,7 @@ class ReadOnlyMemTable { VersionEdit edit_; // The log files earlier than this number can be deleted. - uint64_t mem_next_logfile_number_{0}; + uint64_t mem_next_walfile_number_{0}; // Memtable id to track flush. uint64_t id_ = 0; @@ -524,6 +529,8 @@ class ReadOnlyMemTable { // Flush job info of the current memtable. std::unique_ptr flush_job_info_; + + RelaxedAtomic marked_for_flush_{false}; }; class MemTable final : public ReadOnlyMemTable { @@ -561,7 +568,7 @@ class MemTable final : public ReadOnlyMemTable { // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't // require external synchronization. The value may be less accurate though size_t ApproximateMemoryUsageFast() const { - return approximate_memory_usage_.load(std::memory_order_relaxed); + return approximate_memory_usage_.LoadRelaxed(); } size_t MemoryAllocatedBytes() const override { @@ -681,49 +688,42 @@ class MemTable final : public ReadOnlyMemTable { // Update counters and flush status after inserting a whole write batch // Used in concurrent memtable inserts. void BatchPostProcess(const MemTablePostProcessInfo& update_counters) { - num_entries_.fetch_add(update_counters.num_entries, - std::memory_order_relaxed); - data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed); + table_->BatchPostProcess(); + num_entries_.FetchAddRelaxed(update_counters.num_entries); + data_size_.FetchAddRelaxed(update_counters.data_size); if (update_counters.num_deletes != 0) { - num_deletes_.fetch_add(update_counters.num_deletes, - std::memory_order_relaxed); + num_deletes_.FetchAddRelaxed(update_counters.num_deletes); } if (update_counters.num_range_deletes > 0) { - num_range_deletes_.fetch_add(update_counters.num_range_deletes, - std::memory_order_relaxed); + num_range_deletes_.FetchAddRelaxed(update_counters.num_range_deletes); + // noop for skip-list memtable + // Besides correctness test in stress test, memtable flush record count + // check will catch this if it were not noop. + // range_del_table_->BatchPostProcess(); } UpdateFlushState(); } - uint64_t NumEntries() const override { - return num_entries_.load(std::memory_order_relaxed); - } + uint64_t NumEntries() const override { return num_entries_.LoadRelaxed(); } - uint64_t NumDeletion() const override { - return num_deletes_.load(std::memory_order_relaxed); - } + uint64_t NumDeletion() const override { return num_deletes_.LoadRelaxed(); } uint64_t NumRangeDeletion() const override { - return num_range_deletes_.load(std::memory_order_relaxed); + return num_range_deletes_.LoadRelaxed(); } - uint64_t GetDataSize() const override { - return data_size_.load(std::memory_order_relaxed); - } + uint64_t GetDataSize() const override { return data_size_.LoadRelaxed(); } - size_t write_buffer_size() const { - return write_buffer_size_.load(std::memory_order_relaxed); - } + size_t write_buffer_size() const { return write_buffer_size_.LoadRelaxed(); } // Dynamically change the memtable's capacity. If set below the current usage, // the next key added will trigger a flush. Can only increase size when // memtable prefix bloom is disabled, since we can't easily allocate more - // space. + // space. Non-atomic update ok because this is only called with DB mutex held. void UpdateWriteBufferSize(size_t new_write_buffer_size) { if (bloom_filter_ == nullptr || - new_write_buffer_size < write_buffer_size_) { - write_buffer_size_.store(new_write_buffer_size, - std::memory_order_relaxed); + new_write_buffer_size < write_buffer_size_.LoadRelaxed()) { + write_buffer_size_.StoreRelaxed(new_write_buffer_size); } } @@ -815,9 +815,11 @@ class MemTable final : public ReadOnlyMemTable { bool IsFragmentedRangeTombstonesConstructed() const override { return fragmented_range_tombstone_list_.get() != nullptr || - is_range_del_table_empty_; + is_range_del_table_empty_.LoadRelaxed(); } + // Gets the newest user defined timestamps in the memtable. This should only + // be called when user defined timestamp is enabled. const Slice& GetNewestUDT() const override; // Returns Corruption status if verification fails. @@ -825,6 +827,9 @@ class MemTable final : public ReadOnlyMemTable { uint32_t protection_bytes_per_key, bool allow_data_in_errors = false); + // Validate the checksum of the key/value pair. + Status ValidateKey(const char* key, bool allow_data_in_errors); + private: enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED }; @@ -839,16 +844,22 @@ class MemTable final : public ReadOnlyMemTable { ConcurrentArena arena_; std::unique_ptr table_; std::unique_ptr range_del_table_; - std::atomic_bool is_range_del_table_empty_; + // This is OK to be relaxed access because consistency between table_ and + // range_del_table_ is provided by explicit multi-versioning with sequence + // numbers. It's ok for stale memory to say the range_del_table_ is empty when + // it's actually not because if it was relevant to our read (based on sequence + // number), the relaxed memory read would get a sufficiently updated value + // because of the ordering provided by LastPublishedSequence(). + RelaxedAtomic is_range_del_table_empty_; // Total data size of all data inserted - std::atomic data_size_; - std::atomic num_entries_; - std::atomic num_deletes_; - std::atomic num_range_deletes_; + RelaxedAtomic data_size_; + RelaxedAtomic num_entries_; + RelaxedAtomic num_deletes_; + RelaxedAtomic num_range_deletes_; // Dynamically changeable memtable option - std::atomic write_buffer_size_; + RelaxedAtomic write_buffer_size_; // The sequence number of the kv that was inserted first std::atomic first_seqno_; @@ -884,7 +895,7 @@ class MemTable final : public ReadOnlyMemTable { // keep track of memory usage in table_, arena_, and range_del_table_. // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow` - std::atomic approximate_memory_usage_; + RelaxedAtomic approximate_memory_usage_; // max range deletions in a memtable, before automatic flushing, 0 for // unlimited. @@ -893,14 +904,10 @@ class MemTable final : public ReadOnlyMemTable { // Size in bytes for the user-defined timestamps. size_t ts_sz_; - // Whether to persist user-defined timestamps - bool persist_user_defined_timestamps_; - // Newest user-defined timestamp contained in this MemTable. For ts1, and ts2 // if Comparator::CompareTimestamp(ts1, ts2) > 0, ts1 is considered newer than // ts2. We track this field for a MemTable if its column family has UDT - // feature enabled and the `persist_user_defined_timestamp` flag is false. - // Otherwise, this field just contains an empty Slice. + // feature enabled. Slice newest_udt_; // Updates flush_state_ using ShouldFlushNow() @@ -939,14 +946,22 @@ class MemTable final : public ReadOnlyMemTable { // makes sure there is a single range tombstone writer to invalidate cache std::mutex range_del_mutex_; +#if defined(__cpp_lib_atomic_shared_ptr) + CoreLocalArray< + std::atomic>> + cached_range_tombstone_; +#else CoreLocalArray> cached_range_tombstone_; +#endif void UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info, const Slice& key, const Slice& value, ValueType type, SequenceNumber s, char* checksum_ptr); void MaybeUpdateNewestUDT(const Slice& user_key); + + const std::function key_validation_callback_; }; const char* EncodeKey(std::string* scratch, const Slice& target); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 2643110a13c3..93d8b05f836d 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -51,9 +51,7 @@ void MemTableListVersion::UnrefMemTable( MemTableListVersion::MemTableListVersion( size_t* parent_memtable_list_memory_usage, const MemTableListVersion& old) - : max_write_buffer_number_to_maintain_( - old.max_write_buffer_number_to_maintain_), - max_write_buffer_size_to_maintain_( + : max_write_buffer_size_to_maintain_( old.max_write_buffer_size_to_maintain_), parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) { memlist_ = old.memlist_; @@ -69,10 +67,8 @@ MemTableListVersion::MemTableListVersion( MemTableListVersion::MemTableListVersion( size_t* parent_memtable_list_memory_usage, - int max_write_buffer_number_to_maintain, int64_t max_write_buffer_size_to_maintain) - : max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain), - max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain), + : max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain), parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {} void MemTableListVersion::Ref() { ++refs_; } @@ -323,8 +319,7 @@ void MemTableListVersion::Remove(ReadOnlyMemTable* m, memlist_.remove(m); m->MarkFlushed(); - if (max_write_buffer_size_to_maintain_ > 0 || - max_write_buffer_number_to_maintain_ > 0) { + if (max_write_buffer_size_to_maintain_ > 0) { memlist_history_.push_front(m); // Unable to get size of mutable memtable at this point, pass 0 to // TrimHistory as a best effort. @@ -356,9 +351,6 @@ bool MemTableListVersion::MemtableLimitExceeded(size_t usage) { // whether to trim history return MemoryAllocatedBytesExcludingLast() + usage >= static_cast(max_write_buffer_size_to_maintain_); - } else if (max_write_buffer_number_to_maintain_ > 0) { - return memlist_.size() + memlist_history_.size() > - static_cast(max_write_buffer_number_to_maintain_); } else { return false; } @@ -382,6 +374,19 @@ bool MemTableListVersion::TrimHistory(autovector* to_delete, return ret; } +const Slice& MemTableListVersion::GetNewestUDT() const { + static Slice kEmptySlice; + for (auto it = memlist_.begin(); it != memlist_.end(); ++it) { + ReadOnlyMemTable* m = *it; + Slice timestamp = m->GetNewestUDT(); + assert(!timestamp.empty() || m->IsEmpty()); + if (!timestamp.empty()) { + return m->GetNewestUDT(); + } + } + return kEmptySlice; +} + // Returns true if there is at least one memtable on which flush has // not yet started. bool MemTableList::IsFlushPending() const { diff --git a/db/memtable_list.h b/db/memtable_list.h index 155878bdc268..eb42e1c7276a 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -45,7 +45,6 @@ class MemTableListVersion { explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, const MemTableListVersion& old); explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, - int max_write_buffer_number_to_maintain, int64_t max_write_buffer_size_to_maintain); void Ref(); @@ -150,6 +149,12 @@ class MemTableListVersion { int NumFlushed() const { return static_cast(memlist_history_.size()); } + // Gets the newest user defined timestamps from the immutable memtables. + // This returns the newest user defined timestamp found in the most recent + // immutable memtable. This should only be called when user defined timestamp + // is enabled. + const Slice& GetNewestUDT() const; + private: friend class MemTableList; @@ -209,8 +214,6 @@ class MemTableListVersion { // (used during Transaction validation) std::list memlist_history_; - // Maximum number of MemTables to keep in memory (including both flushed - const int max_write_buffer_number_to_maintain_; // Maximum size of MemTables to keep in memory (including both flushed // and not-yet-flushed tables). const int64_t max_write_buffer_size_to_maintain_; @@ -238,13 +241,11 @@ class MemTableList { public: // A list of memtables. explicit MemTableList(int min_write_buffer_number_to_merge, - int max_write_buffer_number_to_maintain, int64_t max_write_buffer_size_to_maintain) : imm_flush_needed(false), imm_trim_needed(false), min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge), current_(new MemTableListVersion(¤t_memory_usage_, - max_write_buffer_number_to_maintain, max_write_buffer_size_to_maintain)), num_flush_not_started_(0), commit_in_progress_(false), diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index cefb4653d616..c5589b2643a0 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -33,12 +33,12 @@ std::string ValueWithWriteTime(std::string value, uint64_t write_time) { class MemTableListTest : public testing::Test { public: std::string dbname; - DB* db; + std::unique_ptr db; Options options; std::vector handles; std::atomic file_number; - MemTableListTest() : db(nullptr), file_number(1) { + MemTableListTest() : file_number(1) { dbname = test::PerThreadDBPath("memtable_list_test"); options.create_if_missing = true; EXPECT_OK(DestroyDB(dbname, options)); @@ -88,8 +88,7 @@ class MemTableListTest : public testing::Test { } } handles.clear(); - delete db; - db = nullptr; + db.reset(); EXPECT_OK(DestroyDB(dbname, options, cf_descs)); } } @@ -112,7 +111,8 @@ class MemTableListTest : public testing::Test { WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size); WriteController write_controller(10000000u); - VersionSet versions(dbname, &immutable_db_options, env_options, + VersionSet versions(dbname, &immutable_db_options, + MutableDBOptions{db_options}, env_options, table_cache.get(), &write_buffer_manager, &write_controller, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", @@ -163,7 +163,8 @@ class MemTableListTest : public testing::Test { WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size); WriteController write_controller(10000000u); - VersionSet versions(dbname, &immutable_db_options, env_options, + VersionSet versions(dbname, &immutable_db_options, + MutableDBOptions{db_options}, env_options, table_cache.get(), &write_buffer_manager, &write_controller, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", @@ -220,7 +221,7 @@ class MemTableListTest : public testing::Test { TEST_F(MemTableListTest, Empty) { // Create an empty MemTableList and validate basic functions. - MemTableList list(1, 0, 0); + MemTableList list(1, 0); ASSERT_EQ(0, list.NumNotFlushed()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -239,10 +240,8 @@ TEST_F(MemTableListTest, Empty) { TEST_F(MemTableListTest, GetTest) { // Create MemTableList int min_write_buffer_number_to_merge = 2; - int max_write_buffer_number_to_maintain = 0; int64_t max_write_buffer_size_to_maintain = 0; MemTableList list(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain, max_write_buffer_size_to_maintain); SequenceNumber seq = 1; @@ -407,10 +406,8 @@ TEST_F(MemTableListTest, GetTest) { TEST_F(MemTableListTest, GetFromHistoryTest) { // Create MemTableList int min_write_buffer_number_to_merge = 2; - int max_write_buffer_number_to_maintain = 2; int64_t max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize; MemTableList list(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain, max_write_buffer_size_to_maintain); SequenceNumber seq = 1; @@ -653,11 +650,9 @@ TEST_F(MemTableListTest, FlushPendingTest) { // Create MemTableList int min_write_buffer_number_to_merge = 3; - int max_write_buffer_number_to_maintain = 7; int64_t max_write_buffer_size_to_maintain = 7 * static_cast(options.write_buffer_size); MemTableList list(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain, max_write_buffer_size_to_maintain); // Create some MemTables @@ -949,13 +944,11 @@ TEST_F(MemTableListTest, AtomicFlushTest) { // Create MemTableLists int min_write_buffer_number_to_merge = 3; - int max_write_buffer_number_to_maintain = 7; int64_t max_write_buffer_size_to_maintain = 7 * static_cast(options.write_buffer_size); autovector lists; for (int i = 0; i != num_cfs; ++i) { lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain, max_write_buffer_size_to_maintain)); } @@ -1104,11 +1097,9 @@ TEST_F(MemTableListWithTimestampTest, GetTableNewestUDT) { // Create MemTableList int min_write_buffer_number_to_merge = 1; - int max_write_buffer_number_to_maintain = 4; int64_t max_write_buffer_size_to_maintain = 4 * static_cast(options.write_buffer_size); MemTableList list(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain, max_write_buffer_size_to_maintain); // Create some MemTables diff --git a/db/merge_helper.cc b/db/merge_helper.cc index 2576aae840d7..0261ba0e27db 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -497,6 +497,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, ikey.sequence <= latest_snapshot_ ? CompactionFilter::Decision::kKeep : FilterMerge(orig_ikey.user_key, value_slice); + // FIXME: should also check for kRemove here if (filter != CompactionFilter::Decision::kRemoveAndSkipUntil && range_del_agg != nullptr && range_del_agg->ShouldDelete( diff --git a/db/merge_helper.h b/db/merge_helper.h index 39bd15f60876..098b9b5baba6 100644 --- a/db/merge_helper.h +++ b/db/merge_helper.h @@ -250,7 +250,7 @@ class MergeHelper { // Parallel with keys_; stores the operands mutable MergeContext merge_context_; - StopWatchNano filter_timer_; + StopWatchNano<> filter_timer_; uint64_t total_filter_time_; Statistics* stats_; @@ -307,7 +307,7 @@ class MergeOutputIterator { Slice key() { return Slice(*it_keys_); } Slice value() { return Slice(*it_values_); } - bool Valid() { return it_keys_ != merge_helper_->keys().rend(); } + bool Valid() const { return it_keys_ != merge_helper_->keys().rend(); } private: const MergeHelper* merge_helper_; diff --git a/db/merge_operator.cc b/db/merge_operator.cc index bb5dbbc36533..ef12f726d393 100644 --- a/db/merge_operator.cc +++ b/db/merge_operator.cc @@ -32,6 +32,7 @@ bool MergeOperator::FullMergeV3(const MergeOperationInputV3& merge_in, MergeOperationOutputV3* merge_out) const { assert(merge_out); + Slice value_of_default; // avoid warning about in_v2 pointing at this MergeOperationInput in_v2(merge_in.key, nullptr, merge_in.operand_list, merge_in.logger); @@ -66,7 +67,6 @@ bool MergeOperator::FullMergeV3(const MergeOperationInputV3& merge_in, const bool has_default_column = WideColumnsHelper::HasDefaultColumn(existing_columns); - Slice value_of_default; if (has_default_column) { value_of_default = existing_columns.front().value(); } diff --git a/db/merge_test.cc b/db/merge_test.cc index 0592856b7353..5f3546d6ce93 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -19,6 +19,7 @@ #include "rocksdb/utilities/db_ttl.h" #include "rocksdb/wide_columns.h" #include "test_util/testharness.h" +#include "util/cast_util.h" #include "util/coding.h" #include "utilities/merge_operators.h" @@ -96,9 +97,9 @@ class EnvMergeTest : public EnvWrapper { uint64_t EnvMergeTest::now_nanos_count_{0}; std::unique_ptr EnvMergeTest::singleton_; -std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, +std::unique_ptr OpenDb(const std::string& dbname, const bool ttl = false, const size_t max_successive_merges = 0) { - DB* db; + std::unique_ptr db; Options options; options.create_if_missing = true; options.merge_operator = std::make_shared(); @@ -109,7 +110,7 @@ std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, if (ttl) { DBWithTTL* db_with_ttl; s = DBWithTTL::Open(options, dbname, &db_with_ttl); - db = db_with_ttl; + db.reset(db_with_ttl); } else { s = DB::Open(options, dbname, &db); } @@ -118,7 +119,7 @@ std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, // Allowed to call NowNanos during DB creation (in GenerateRawUniqueId() for // session ID) EnvMergeTest::now_nanos_count_ = 0; - return std::shared_ptr(db); + return db; } // Imagine we are maintaining a set of uint64 counters. @@ -128,7 +129,7 @@ std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, // This is a quick implementation without a Merge operation. class Counters { protected: - std::shared_ptr db_; + UnownedPtr db_; WriteOptions put_option_; ReadOptions get_option_; @@ -137,7 +138,7 @@ class Counters { uint64_t default_; public: - explicit Counters(std::shared_ptr db, uint64_t defaultCount = 0) + explicit Counters(UnownedPtr db, uint64_t defaultCount = 0) : db_(db), put_option_(), get_option_(), @@ -242,7 +243,7 @@ class MergeBasedCounters : public Counters { WriteOptions merge_option_; // for merge public: - explicit MergeBasedCounters(std::shared_ptr db, uint64_t defaultCount = 0) + explicit MergeBasedCounters(UnownedPtr db, uint64_t defaultCount = 0) : Counters(db, defaultCount), merge_option_() {} // mapped to a rocksdb Merge operation @@ -261,7 +262,7 @@ class MergeBasedCounters : public Counters { } }; -void dumpDb(DB* db) { +void dumpDb(const std::unique_ptr& db) { auto it = std::unique_ptr(db->NewIterator(ReadOptions())); for (it->SeekToFirst(); it->Valid(); it->Next()) { // uint64_t value = DecodeFixed64(it->value().data()); @@ -270,7 +271,8 @@ void dumpDb(DB* db) { assert(it->status().ok()); // Check for any errors found during the scan } -void testCounters(Counters& counters, DB* db, bool test_compaction) { +void testCounters(Counters& counters, const std::unique_ptr& db, + bool test_compaction) { FlushOptions o; o.wait = true; @@ -320,7 +322,8 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) { } } -void testCountersWithFlushAndCompaction(Counters& counters, DB* db) { +void testCountersWithFlushAndCompaction(Counters& counters, + const std::unique_ptr& db) { ASSERT_OK(db->Put({}, "1", "1")); ASSERT_OK(db->Flush(FlushOptions())); @@ -388,12 +391,12 @@ void testCountersWithFlushAndCompaction(Counters& counters, DB* db) { SyncPoint::GetInstance()->EnableProcessing(); port::Thread set_options_thread([&]() { - ASSERT_OK(static_cast(db)->SetOptions( + ASSERT_OK(static_cast_with_check(db.get())->SetOptions( {{"disable_auto_compactions", "false"}})); }); TEST_SYNC_POINT("testCountersWithCompactionAndFlush:BeforeCompact"); port::Thread compact_thread([&]() { - ASSERT_OK(static_cast(db)->CompactRange( + ASSERT_OK(static_cast_with_check(db.get())->CompactRange( CompactRangeOptions(), db->DefaultColumnFamily(), nullptr, nullptr)); }); @@ -440,8 +443,8 @@ void testSuccessiveMerge(Counters& counters, size_t max_num_merges, } } -void testPartialMerge(Counters* counters, DB* db, size_t max_merge, - size_t min_merge, size_t count) { +void testPartialMerge(Counters* counters, const std::unique_ptr& db, + size_t max_merge, size_t min_merge, size_t count) { FlushOptions o; o.wait = true; @@ -481,8 +484,8 @@ void testPartialMerge(Counters* counters, DB* db, size_t max_merge, ASSERT_EQ(EnvMergeTest::now_nanos_count_, 0U); } -void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges, - size_t num_merges) { +void testSingleBatchSuccessiveMerge(const std::unique_ptr& db, + size_t max_num_merges, size_t num_merges) { ASSERT_GT(num_merges, max_num_merges); Slice key("BatchSuccessiveMerge"); @@ -520,13 +523,13 @@ void runTest(const std::string& dbname, const bool use_ttl = false) { auto db = OpenDb(dbname, use_ttl); { - Counters counters(db, 0); - testCounters(counters, db.get(), true); + Counters counters(db.get(), 0); + testCounters(counters, db, true); } { - MergeBasedCounters counters(db, 0); - testCounters(counters, db.get(), use_compression); + MergeBasedCounters counters(db.get(), 0); + testCounters(counters, db, use_compression); } } @@ -535,10 +538,10 @@ void runTest(const std::string& dbname, const bool use_ttl = false) { { size_t max_merge = 5; auto db = OpenDb(dbname, use_ttl, max_merge); - MergeBasedCounters counters(db, 0); - testCounters(counters, db.get(), use_compression); + MergeBasedCounters counters(db.get(), 0); + testCounters(counters, db, use_compression); testSuccessiveMerge(counters, max_merge, max_merge * 2); - testSingleBatchSuccessiveMerge(db.get(), 5, 7); + testSingleBatchSuccessiveMerge(db, 5, 7); ASSERT_OK(db->Close()); ASSERT_OK(DestroyDB(dbname, Options())); } @@ -549,16 +552,15 @@ void runTest(const std::string& dbname, const bool use_ttl = false) { uint32_t min_merge = 2; for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) { auto db = OpenDb(dbname, use_ttl, max_merge); - MergeBasedCounters counters(db, 0); - testPartialMerge(&counters, db.get(), max_merge, min_merge, count); + MergeBasedCounters counters(db.get(), 0); + testPartialMerge(&counters, db, max_merge, min_merge, count); ASSERT_OK(db->Close()); ASSERT_OK(DestroyDB(dbname, Options())); } { auto db = OpenDb(dbname, use_ttl, max_merge); - MergeBasedCounters counters(db, 0); - testPartialMerge(&counters, db.get(), max_merge, min_merge, - min_merge * 10); + MergeBasedCounters counters(db.get(), 0); + testPartialMerge(&counters, db, max_merge, min_merge, min_merge * 10); ASSERT_OK(db->Close()); ASSERT_OK(DestroyDB(dbname, Options())); } @@ -567,18 +569,18 @@ void runTest(const std::string& dbname, const bool use_ttl = false) { { { auto db = OpenDb(dbname); - MergeBasedCounters counters(db, 0); + MergeBasedCounters counters(db.get(), 0); counters.add("test-key", 1); counters.add("test-key", 1); counters.add("test-key", 1); ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } - DB* reopen_db; + std::unique_ptr reopen_db; ASSERT_OK(DB::Open(Options(), dbname, &reopen_db)); std::string value; ASSERT_NOK(reopen_db->Get(ReadOptions(), "test-key", &value)); - delete reopen_db; + reopen_db.reset(); ASSERT_OK(DestroyDB(dbname, Options())); } @@ -587,13 +589,13 @@ void runTest(const std::string& dbname, const bool use_ttl = false) { std::cout << "Test merge-operator not set after reopen (recovery case)\n"; { auto db = OpenDb(dbname); - MergeBasedCounters counters(db, 0); + MergeBasedCounters counters(db.get(), 0); counters.add("test-key", 1); counters.add("test-key", 1); counters.add("test-key", 1); } - DB* reopen_db; + std::unique_ptr reopen_db; ASSERT_TRUE(DB::Open(Options(), dbname, &reopen_db).IsInvalidArgument()); } */ @@ -614,8 +616,8 @@ TEST_F(MergeTest, MergeWithCompactionAndFlush) { { auto db = OpenDb(dbname); { - MergeBasedCounters counters(db, 0); - testCountersWithFlushAndCompaction(counters, db.get()); + MergeBasedCounters counters(db.get(), 0); + testCountersWithFlushAndCompaction(counters, db); } } ASSERT_OK(DestroyDB(dbname, Options())); diff --git a/db/multi_scan.cc b/db/multi_scan.cc new file mode 100644 index 000000000000..3d3855e0946d --- /dev/null +++ b/db/multi_scan.cc @@ -0,0 +1,76 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/db.h" + +namespace ROCKSDB_NAMESPACE { + +using MultiScanIterator = MultiScan::MultiScanIterator; + +MultiScan::MultiScan(const ReadOptions& read_options, + const MultiScanArgs& scan_opts, DB* db, + ColumnFamilyHandle* cfh) + : read_options_(read_options), scan_opts_(scan_opts), db_(db), cfh_(cfh) { + bool slow_path = false; + // Setup read_options with iterate_uuper_bound based on the first scan. + // Subsequent scans will update and allocate a new DB iterator as necessary + if (scan_opts.GetScanRanges()[0].range.limit) { + upper_bound_ = *scan_opts.GetScanRanges()[0].range.limit; + read_options_.iterate_upper_bound = &upper_bound_; + } else { + read_options_.iterate_upper_bound = nullptr; + } + for (const auto& opts : scan_opts.GetScanRanges()) { + // Check that all the ScanOptions either specify an upper bound or not. If + // its mixed we take the slow path which avoids calling Prepare: we have to + // reallocate the Iterator with updated read_options everytime we switch + // between upper bound or no upper bound, which complicates Prepare. + if (opts.range.limit.has_value() != + scan_opts.GetScanRanges()[0].range.limit.has_value()) { + slow_path = true; + break; + } + } + db_iter_.reset(db->NewIterator(read_options_, cfh)); + if (!slow_path) { + db_iter_->Prepare(scan_opts); + } +} + +MultiScanIterator& MultiScanIterator::operator++() { + status_ = db_iter_->status(); + if (!status_.ok()) { + throw MultiScanException(status_); + } + + if (idx_ >= scan_opts_.size()) { + throw std::logic_error("Index out of range"); + } + idx_++; + if (idx_ < scan_opts_.size()) { + // Check if we need to update read_options_ + if (scan_opts_[idx_].range.limit.has_value() != + (read_options_.iterate_upper_bound != nullptr)) { + if (scan_opts_[idx_].range.limit) { + *upper_bound_ = *scan_opts_[idx_].range.limit; + read_options_.iterate_upper_bound = upper_bound_; + } else { + read_options_.iterate_upper_bound = nullptr; + } + db_iter_.reset(db_->NewIterator(read_options_, cfh_)); + scan_.Reset(db_iter_.get()); + } else if (scan_opts_[idx_].range.limit) { + *upper_bound_ = *scan_opts_[idx_].range.limit; + } + db_iter_->Seek(*scan_opts_[idx_].range.start); + status_ = db_iter_->status(); + if (!status_.ok()) { + throw MultiScanException(status_); + } + } + return *this; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc index eb3ed078c79e..7709a80fcc59 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -303,6 +303,48 @@ TEST_F(ObsoleteFilesTest, BlobFiles) { ASSERT_EQ(deleted_files, expected_deleted_files); } +TEST_F(ObsoleteFilesTest, GetSortedWalFilesHangsAfterNoopPurge) { + // This test used to trigger a hang in `DB::GetSortedWalFiles()`, where it + // would wait for a no-op purge that did not signal the CV upon completion. + + // Grab an iterator and flush to switch the super version. That way, when the + // iterator is destroyed, it will go through the purge path. + DB* db = + db_.get(); // Only using `db` makes it clear we only use DB-level APIs. + ASSERT_OK(db->Put(WriteOptions(), "key", "value")); + std::unique_ptr iter(db->NewIterator(ReadOptions())); + ASSERT_OK(db->Flush(FlushOptions())); + + // Sync points ensure `GetSortedWalFiles()` waits for a purge after + // `FindObsoleteFiles()` releases the mutex but before its corresponding purge + // completes. + SyncPoint::GetInstance()->SetCallBack( + "FindObsoleteFiles::PostMutexUnlock", [&](void* /* arg */) { + TEST_SYNC_POINT( + "ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:" + "InCallback:1"); + TEST_SYNC_POINT( + "ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:" + "InCallback:2"); + }); + SyncPoint::GetInstance()->LoadDependency({ + {"ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:InCallback:1", + "ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:Thread:Begin"}, + {"DBImpl::GetSortedWalFilesImpl:WaitPurge", + "ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:InCallback:2"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread get_sorted_wal_files_thread([db]() { + TEST_SYNC_POINT( + "ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:Thread:Begin"); + VectorWalPtr files; + ASSERT_OK(db->GetSortedWalFiles(files)); + }); + iter.reset(); + get_sorted_wal_files_thread.join(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/options_file_test.cc b/db/options_file_test.cc index 7e48f0cf38c1..f420d0dff4df 100644 --- a/db/options_file_test.cc +++ b/db/options_file_test.cc @@ -66,16 +66,16 @@ TEST_F(OptionsFileTest, NumberOfOptionsFiles) { opt.create_if_missing = true; ASSERT_OK(DestroyDB(dbname_, opt)); std::unordered_set filename_history; - DB* db; + std::unique_ptr db; for (int i = 0; i < kReopenCount; ++i) { ASSERT_OK(DB::Open(opt, dbname_, &db)); int num_options_files = 0; - UpdateOptionsFiles(db, &filename_history, &num_options_files); + UpdateOptionsFiles(db.get(), &filename_history, &num_options_files); ASSERT_GT(num_options_files, 0); ASSERT_LE(num_options_files, 2); // Make sure we always keep the latest option files. - VerifyOptionsFileName(db, filename_history); - delete db; + VerifyOptionsFileName(db.get(), filename_history); + db.reset(); } } diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index c439c1ffedf7..fbd38c6c26ee 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -38,8 +38,8 @@ const std::string kDbName = namespace ROCKSDB_NAMESPACE { -std::shared_ptr OpenDb(bool read_only = false) { - DB* db; +std::unique_ptr OpenDb(bool read_only = false) { + std::unique_ptr db; Options options; options.create_if_missing = true; options.max_open_files = -1; @@ -61,7 +61,7 @@ std::shared_ptr OpenDb(bool read_only = false) { s = DB::OpenForReadOnly(options, kDbName, &db); } EXPECT_OK(s); - return std::shared_ptr(db); + return db; } class PerfContextTest : public testing::Test {}; @@ -659,12 +659,11 @@ TEST_F(PerfContextTest, ToString) { TEST_F(PerfContextTest, MergeOperatorTime) { ASSERT_OK(DestroyDB(kDbName, Options())); - DB* db; + std::unique_ptr db; Options options; options.create_if_missing = true; options.merge_operator = MergeOperators::CreateStringAppendOperator(); - Status s = DB::Open(options, kDbName, &db); - EXPECT_OK(s); + EXPECT_OK(DB::Open(options, kDbName, &db)); std::string val; ASSERT_OK(db->Merge(WriteOptions(), "k1", "val1")); @@ -704,7 +703,7 @@ TEST_F(PerfContextTest, MergeOperatorTime) { #endif EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0); - delete db; + db.reset(); } TEST_F(PerfContextTest, CopyAndMove) { @@ -972,13 +971,12 @@ TEST_F(PerfContextTest, CPUTimer) { TEST_F(PerfContextTest, MergeOperandCount) { ASSERT_OK(DestroyDB(kDbName, Options())); - DB* db = nullptr; Options options; options.create_if_missing = true; options.merge_operator = MergeOperators::CreateStringAppendOperator(); + std::unique_ptr db; ASSERT_OK(DB::Open(options, kDbName, &db)); - std::unique_ptr db_guard(db); constexpr size_t num_keys = 3; const std::string key_prefix("key"); @@ -1007,7 +1005,7 @@ TEST_F(PerfContextTest, MergeOperandCount) { for (size_t j = 0; j <= i; ++j) { // Take a snapshot before each Merge so they are preserved and not // collapsed during flush. - snapshots.emplace_back(db); + snapshots.emplace_back(db.get()); ASSERT_OK(db->Merge(WriteOptions(), keys[i], value + std::to_string(j))); } @@ -1124,7 +1122,7 @@ TEST_F(PerfContextTest, MergeOperandCount) { TEST_F(PerfContextTest, WriteMemtableTimePerfLevel) { // Write and check time ASSERT_OK(DestroyDB(kDbName, Options())); - std::shared_ptr db = OpenDb(); + auto db = OpenDb(); SetPerfLevel(PerfLevel::kEnableWait); PerfContext* perf_ctx = get_perf_context(); diff --git a/db/periodic_task_scheduler.cc b/db/periodic_task_scheduler.cc index 2f266529c57c..ee3f07b91e73 100644 --- a/db/periodic_task_scheduler.cc +++ b/db/periodic_task_scheduler.cc @@ -26,6 +26,7 @@ static const std::map kDefaultPeriodSeconds = { {PeriodicTaskType::kPersistStats, kInvalidPeriodSec}, {PeriodicTaskType::kFlushInfoLog, 10}, {PeriodicTaskType::kRecordSeqnoTime, kInvalidPeriodSec}, + {PeriodicTaskType::kTriggerCompaction, 12 * 60 * 60} // 12 hours }; static const std::map kPeriodicTaskTypeNames = { @@ -33,16 +34,20 @@ static const std::map kPeriodicTaskTypeNames = { {PeriodicTaskType::kPersistStats, "pst_st"}, {PeriodicTaskType::kFlushInfoLog, "flush_info_log"}, {PeriodicTaskType::kRecordSeqnoTime, "record_seq_time"}, + {PeriodicTaskType::kTriggerCompaction, "trigger_compaction"}, }; Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type, - const PeriodicTaskFunc& fn) { - return Register(task_type, fn, kDefaultPeriodSeconds.at(task_type)); + const PeriodicTaskFunc& fn, + bool run_immediately) { + return Register(task_type, fn, kDefaultPeriodSeconds.at(task_type), + run_immediately); } Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn, - uint64_t repeat_period_seconds) { + uint64_t repeat_period_seconds, + bool run_immediately) { MutexLock l(&timer_mutex); static std::atomic initial_delay(0); @@ -65,10 +70,13 @@ Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type, std::string unique_id = kPeriodicTaskTypeNames.at(task_type) + std::to_string(id_++); - bool succeeded = timer_->Add( - fn, unique_id, - (initial_delay.fetch_add(1) % repeat_period_seconds) * kMicrosInSecond, - repeat_period_seconds * kMicrosInSecond); + uint64_t initial_delay_micros = + (initial_delay.fetch_add(1) % repeat_period_seconds) * kMicrosInSecond; + if (!run_immediately) { + initial_delay_micros += repeat_period_seconds * kMicrosInSecond; + } + bool succeeded = timer_->Add(fn, unique_id, initial_delay_micros, + repeat_period_seconds * kMicrosInSecond); if (!succeeded) { return Status::Aborted("Failed to register periodic task"); } diff --git a/db/periodic_task_scheduler.h b/db/periodic_task_scheduler.h index 3ac8a3b9cee6..8511f5f2d8e7 100644 --- a/db/periodic_task_scheduler.h +++ b/db/periodic_task_scheduler.h @@ -21,6 +21,7 @@ enum class PeriodicTaskType : uint8_t { kPersistStats, kFlushInfoLog, kRecordSeqnoTime, + kTriggerCompaction, kMax, }; @@ -42,13 +43,16 @@ class PeriodicTaskScheduler { PeriodicTaskScheduler& operator=(PeriodicTaskScheduler&&) = delete; // Register a task with its default repeat period. Thread safe call. - Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn); + // @param run_immediately If true, the task will run soon after it's + // scheduled, instead of waiting for the repeat period. + Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn, + bool run_immediately); // Register a task with specified repeat period. 0 is an invalid argument // (kInvalidPeriodSec). To stop the task, please use Unregister(). // Thread safe call. Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn, - uint64_t repeat_period_seconds); + uint64_t repeat_period_seconds, bool run_immediately); // Unregister the task. Thread safe call. Status Unregister(PeriodicTaskType task_type); diff --git a/db/periodic_task_scheduler_test.cc b/db/periodic_task_scheduler_test.cc index baf74ed15e3a..5575333b095a 100644 --- a/db/periodic_task_scheduler_test.cc +++ b/db/periodic_task_scheduler_test.cc @@ -56,6 +56,12 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) { SyncPoint::GetInstance()->SetCallBack( "DBImpl::FlushInfoLog:StartRunning", [&](void*) { flush_info_log_counter++; }); + + int trigger_compaction_counter = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::TriggerPeriodicCompaction:StartRunning", + [&](void*) { trigger_compaction_counter++; }); + SyncPoint::GetInstance()->EnableProcessing(); Reopen(options); @@ -70,7 +76,7 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) { const PeriodicTaskScheduler& scheduler = dbfull()->TEST_GetPeriodicTaskScheduler(); - ASSERT_EQ(3, scheduler.TEST_GetValidTaskNum()); + ASSERT_EQ((int)PeriodicTaskType::kMax - 1, scheduler.TEST_GetValidTaskNum()); ASSERT_EQ(1, dump_st_counter); ASSERT_EQ(1, pst_st_counter); @@ -103,14 +109,14 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) { ASSERT_EQ(3, pst_st_counter); ASSERT_EQ(4, flush_info_log_counter); - ASSERT_EQ(1u, scheduler.TEST_GetValidTaskNum()); + ASSERT_EQ(2u, scheduler.TEST_GetValidTaskNum()); // Re-enable one task ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "5"}})); ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec); ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec); - ASSERT_EQ(2, scheduler.TEST_GetValidTaskNum()); + ASSERT_EQ(3, scheduler.TEST_GetValidTaskNum()); dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); @@ -118,6 +124,16 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) { ASSERT_EQ(3, pst_st_counter); ASSERT_EQ(5, flush_info_log_counter); + ASSERT_EQ(0, trigger_compaction_counter); + dbfull()->TEST_WaitForPeriodicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(12 * 60 * 60)); + }); + ASSERT_EQ(1, trigger_compaction_counter); + dbfull()->TEST_WaitForPeriodicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(12 * 60 * 60)); + }); + ASSERT_EQ(2, trigger_compaction_counter); + Close(); } @@ -141,16 +157,18 @@ TEST_F(PeriodicTaskSchedulerTest, MultiInstances) { [&](void*) { pst_st_counter++; }); SyncPoint::GetInstance()->EnableProcessing(); - auto dbs = std::vector(kInstanceNum); + auto dbs = std::vector>(kInstanceNum); for (int i = 0; i < kInstanceNum; i++) { ASSERT_OK( DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i]))); } - auto dbi = static_cast_with_check(dbs[kInstanceNum - 1]); + auto dbi = static_cast_with_check(dbs[kInstanceNum - 1].get()); const PeriodicTaskScheduler& scheduler = dbi->TEST_GetPeriodicTaskScheduler(); - ASSERT_EQ(kInstanceNum * 3, scheduler.TEST_GetValidTaskNum()); + // kRecordSeqnoTime is not registered since the feature is not enabled + ASSERT_EQ(kInstanceNum * ((int)PeriodicTaskType::kMax - 1), + scheduler.TEST_GetValidTaskNum()); int expected_run = kInstanceNum; dbi->TEST_WaitForPeriodicTaskRun( @@ -172,7 +190,7 @@ TEST_F(PeriodicTaskSchedulerTest, MultiInstances) { int half = kInstanceNum / 2; for (int i = 0; i < half; i++) { - delete dbs[i]; + dbs[i].reset(); } expected_run += (kInstanceNum - half) * 2; @@ -186,7 +204,7 @@ TEST_F(PeriodicTaskSchedulerTest, MultiInstances) { for (int i = half; i < kInstanceNum; i++) { ASSERT_OK(dbs[i]->Close()); - delete dbs[i]; + dbs[i].reset(); } } @@ -211,11 +229,11 @@ TEST_F(PeriodicTaskSchedulerTest, MultiEnv) { options1.env = mock_env2.get(); std::string dbname = test::PerThreadDBPath("multi_env_test"); - DB* db; + std::unique_ptr db; ASSERT_OK(DB::Open(options2, dbname, &db)); ASSERT_OK(db->Close()); - delete db; + db.reset(); Close(); } diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 27aa0e28d0c9..6e2909ca5159 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -98,7 +98,7 @@ class PlainTableDBTest : public testing::Test, private: std::string dbname_; Env* env_; - DB* db_; + std::unique_ptr db_; bool mmap_mode_; Options last_options_; @@ -107,7 +107,7 @@ class PlainTableDBTest : public testing::Test, PlainTableDBTest() : env_(Env::Default()) {} ~PlainTableDBTest() override { - delete db_; + db_.reset(); EXPECT_OK(DestroyDB(dbname_, Options())); } @@ -115,7 +115,7 @@ class PlainTableDBTest : public testing::Test, mmap_mode_ = GetParam(); dbname_ = test::PerThreadDBPath("plain_table_db_test"); EXPECT_OK(DestroyDB(dbname_, Options())); - db_ = nullptr; + db_.reset(); Reopen(); } @@ -144,14 +144,11 @@ class PlainTableDBTest : public testing::Test, return options; } - DBImpl* dbfull() { return static_cast_with_check(db_); } + DBImpl* dbfull() { return static_cast_with_check(db_.get()); } void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); } - void Close() { - delete db_; - db_ = nullptr; - } + void Close() { db_.reset(); } bool mmap_mode() const { return mmap_mode_; } @@ -162,24 +159,21 @@ class PlainTableDBTest : public testing::Test, } void Destroy(Options* options) { - delete db_; - db_ = nullptr; + db_.reset(); ASSERT_OK(DestroyDB(dbname_, *options)); } - Status PureReopen(Options* options, DB** db) { + Status PureReopen(Options* options, std::unique_ptr* db) { return DB::Open(*options, dbname_, db); } Status ReopenForReadOnly(Options* options) { - delete db_; - db_ = nullptr; + db_.reset(); return DB::OpenForReadOnly(*options, dbname_, &db_); } Status TryReopen(Options* options = nullptr) { - delete db_; - db_ = nullptr; + db_.reset(); Options opts; if (options != nullptr) { opts = *options; @@ -495,8 +489,7 @@ TEST_P(PlainTableDBTest, Flush) { ASSERT_GT(int_num, 0U); TablePropertiesCollection ptc; - ASSERT_OK( - static_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&ptc)); ASSERT_EQ(1U, ptc.size()); auto row = ptc.begin(); auto tp = row->second; @@ -1339,11 +1332,7 @@ TEST_P(PlainTableDBTest, AdaptiveTable) { INSTANTIATE_TEST_CASE_P(PlainTableDBTest, PlainTableDBTest, ::testing::Bool()); TEST_P(PlainTableDBTest, DeleteRangeNotSupported) { - // XXX: After attempting DeleteRange with PlainTable, Writes will permanently - // fail. Even if re-opening the DB, if WAL is used, the WAL is not recoverable - // (without manual intervention). Furthermore, a partial write batch can - // be exposed to readers, breaking WriteBatch atomicity. - for (bool use_write_batch : {/*false, */ true}) { + for (bool use_write_batch : {false, true}) { DestroyAndReopen(); ASSERT_OK(Put("a0001111", "1")); @@ -1362,12 +1351,7 @@ TEST_P(PlainTableDBTest, DeleteRangeNotSupported) { ASSERT_EQ(Get("a0001111"), "1"); ASSERT_EQ(Get("b0001111"), "2"); ASSERT_EQ(Get("c0001111"), "3"); - if (use_write_batch) { - // XXX: broken WriteBatch atomicity - ASSERT_EQ(Get("d0001111"), "4"); - } else { - ASSERT_EQ(Get("d0001111"), "NOT_FOUND"); - } + ASSERT_EQ(Get("d0001111"), "NOT_FOUND"); // expect WriteBatch atomicity ASSERT_EQ(Get("e0001111"), "NOT_FOUND"); ASSERT_EQ(Put("e0001111", "5").code(), Status::Code::kNotSupported); @@ -1377,8 +1361,14 @@ TEST_P(PlainTableDBTest, DeleteRangeNotSupported) { ASSERT_EQ(dbfull()->TEST_FlushMemTable().code(), Status::Code::kNotSupported); - // XXX: WAL is not recoverable - ASSERT_EQ(TryReopen().code(), Status::Code::kNotSupported); + // WAL is recoverable (at least in standard configurations) + ASSERT_OK(TryReopen()); + + ASSERT_EQ(Get("a0001111"), "1"); + ASSERT_EQ(Get("b0001111"), "2"); + ASSERT_EQ(Get("c0001111"), "3"); + ASSERT_EQ(Get("d0001111"), "NOT_FOUND"); + ASSERT_EQ(Get("e0001111"), "NOT_FOUND"); } } diff --git a/db/prefix_test.cc b/db/prefix_test.cc index 9b1d4ed79e6a..d1559b50721b 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -220,8 +220,6 @@ class SamePrefixTransform : public SliceTransform { return false; } - bool InRange(const Slice& dst) const override { return dst == prefix_; } - bool FullLengthEnabled(size_t* /*len*/) const override { return false; } }; @@ -229,8 +227,8 @@ class SamePrefixTransform : public SliceTransform { class PrefixTest : public testing::Test { public: - std::shared_ptr OpenDb() { - DB* db; + std::unique_ptr OpenDb() { + std::unique_ptr db; options.create_if_missing = true; options.write_buffer_size = FLAGS_write_buffer_size; @@ -251,7 +249,7 @@ class PrefixTest : public testing::Test { Status s = DB::Open(options, kDbName, &db); EXPECT_OK(s); - return std::shared_ptr(db); + return db; } void FirstOption() { option_config_ = kBegin; } @@ -304,7 +302,7 @@ class PrefixTest : public testing::Test { }; TEST(SamePrefixTest, InDomainTest) { - DB* db; + std::unique_ptr db; Options options; options.create_if_missing = true; options.prefix_extractor.reset(new SamePrefixTransform("HHKB")); @@ -331,7 +329,7 @@ TEST(SamePrefixTest, InDomainTest) { ASSERT_EQ(db_iter->value(), "idk"); delete db_iter; - delete db; + db.reset(); ASSERT_OK(DestroyDB(kDbName, Options())); } @@ -348,7 +346,7 @@ TEST(SamePrefixTest, InDomainTest) { ASSERT_TRUE(db_iter->Valid()); ASSERT_OK(db_iter->status()); delete db_iter; - delete db; + db.reset(); ASSERT_OK(DestroyDB(kDbName, Options())); } } diff --git a/db/repair.cc b/db/repair.cc index 73671154ba5f..941d69dedc11 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -100,13 +100,15 @@ class Repairer { db_options_(SanitizeOptions(dbname_, db_options)), immutable_db_options_(ImmutableDBOptions(db_options_)), icmp_(default_cf_opts.comparator), - default_cf_opts_( - SanitizeOptions(immutable_db_options_, default_cf_opts)), + default_cf_opts_(SanitizeCfOptions(immutable_db_options_, + /*read_only*/ false, + default_cf_opts)), default_iopts_( ImmutableOptions(immutable_db_options_, default_cf_opts_)), default_mopts_(MutableCFOptions(default_cf_opts_)), - unknown_cf_opts_( - SanitizeOptions(immutable_db_options_, unknown_cf_opts)), + unknown_cf_opts_(SanitizeCfOptions(immutable_db_options_, + /*read_only*/ false, + unknown_cf_opts)), create_unknown_cfs_(create_unknown_cfs), raw_table_cache_( // TableCache can be small since we expect each table to be opened @@ -118,8 +120,8 @@ class Repairer { /*io_tracer=*/nullptr, db_session_id_)), wb_(db_options_.db_write_buffer_size), wc_(db_options_.delayed_write_rate), - vset_(dbname_, &immutable_db_options_, file_options_, - raw_table_cache_.get(), &wb_, &wc_, + vset_(dbname_, &immutable_db_options_, MutableDBOptions{db_options_}, + file_options_, raw_table_cache_.get(), &wb_, &wc_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", db_session_id_, db_options.daily_offpeak_time_utc, /*error_handler=*/nullptr, /*read_only=*/false), @@ -456,8 +458,9 @@ class Repairer { meta.file_creation_time = current_time; SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance(); - auto write_hint = - cfd->current()->storage_info()->CalculateSSTWriteHint(/*level=*/0); + auto write_hint = cfd->current()->storage_info()->CalculateSSTWriteHint( + /*level=*/0, db_options_.calculate_sst_write_lifetime_hint_set); + std::vector> range_del_iters; auto range_del_iter = mem->NewRangeTombstoneIterator( @@ -575,14 +578,7 @@ class Repairer { static_cast(props->user_defined_timestamps_persisted); } if (status.ok()) { - uint64_t tail_size = 0; - bool contain_no_data_blocks = - props->num_entries > 0 && - (props->num_entries == props->num_range_deletions); - if (props->tail_start_offset > 0 || contain_no_data_blocks) { - assert(props->tail_start_offset <= file_size); - tail_size = file_size - props->tail_start_offset; - } + uint64_t tail_size = FileMetaData::CalculateTailSize(file_size, *props); t->meta.tail_size = tail_size; } ColumnFamilyData* cfd = nullptr; @@ -708,17 +704,17 @@ class Repairer { VersionEdit dummy_edit; for (const auto* table : cf_id_and_tables.second) { // TODO(opt): separate out into multiple levels + const auto& meta = table->meta; dummy_edit.AddFile( - 0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(), - table->meta.fd.GetFileSize(), table->meta.smallest, - table->meta.largest, table->meta.fd.smallest_seqno, - table->meta.fd.largest_seqno, table->meta.marked_for_compaction, - table->meta.temperature, table->meta.oldest_blob_file_number, - table->meta.oldest_ancester_time, table->meta.file_creation_time, - table->meta.epoch_number, table->meta.file_checksum, - table->meta.file_checksum_func_name, table->meta.unique_id, - table->meta.compensated_range_deletion_size, table->meta.tail_size, - table->meta.user_defined_timestamps_persisted); + 0, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), + meta.smallest, meta.largest, meta.fd.smallest_seqno, + meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature, + meta.oldest_blob_file_number, meta.oldest_ancester_time, + meta.file_creation_time, meta.epoch_number, meta.file_checksum, + meta.file_checksum_func_name, meta.unique_id, + meta.compensated_range_deletion_size, meta.tail_size, + meta.user_defined_timestamps_persisted, meta.min_timestamp, + meta.max_timestamp); } s = dummy_version_builder.Apply(&dummy_edit); if (s.ok()) { diff --git a/db/seqno_time_test.cc b/db/seqno_time_test.cc index cb247edfb767..e474c583d892 100644 --- a/db/seqno_time_test.cc +++ b/db/seqno_time_test.cc @@ -96,7 +96,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) { } ASSERT_OK(dbfull()->TEST_WaitForCompact()); - // All data is hot, only output to penultimate level + // All data is hot, only output to proximal level ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); @@ -185,7 +185,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) { options.num_levels = kNumLevels; options.level_compaction_dynamic_level_bytes = true; // TODO(zjay): for level compaction, auto-compaction may stuck in deadloop, if - // the penultimate level score > 1, but the hot is not cold enough to compact + // the proximal level score > 1, but the hot is not cold enough to compact // to last level, which will keep triggering compaction. options.disable_auto_compactions = true; DestroyAndReopen(options); @@ -205,7 +205,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) { cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - // All data is hot, only output to penultimate level + // All data is hot, only output to proximal level ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); @@ -661,14 +661,14 @@ TEST_P(SeqnoTimeTablePropTest, MultiInstancesBasic) { options.stats_dump_period_sec = 0; options.stats_persist_period_sec = 0; - auto dbs = std::vector(kInstanceNum); + auto dbs = std::vector>(kInstanceNum); for (int i = 0; i < kInstanceNum; i++) { ASSERT_OK( DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i]))); } // Make sure the second instance has the worker enabled - auto dbi = static_cast_with_check(dbs[1]); + auto dbi = static_cast_with_check(dbs[1].get()); WriteOptions wo; for (int i = 0; i < 200; i++) { ASSERT_OK(dbi->Put(wo, Key(i), "value")); @@ -680,7 +680,7 @@ TEST_P(SeqnoTimeTablePropTest, MultiInstancesBasic) { for (int i = 0; i < kInstanceNum; i++) { ASSERT_OK(dbs[i]->Close()); - delete dbs[i]; + dbs[i].reset(); } } @@ -753,7 +753,7 @@ TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) { CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - // make sure the data is all compacted to penultimate level if the feature is + // make sure the data is all compacted to proximal level if the feature is // on, otherwise, compacted to the last level. if (options.preclude_last_level_data_seconds > 0) { ASSERT_GT(NumTableFilesAtLevel(5), 0); @@ -792,9 +792,8 @@ TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) { } ASSERT_GT(num_seqno_zeroing, 0); std::vector key_versions; - ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(), - std::numeric_limits::max(), - &key_versions)); + ASSERT_OK(GetAllKeyVersions( + db_.get(), {}, {}, std::numeric_limits::max(), &key_versions)); // make sure there're more than 300 keys and first 100 keys are having seqno // zeroed out, the last 100 key seqno not zeroed out ASSERT_GT(key_versions.size(), 300); @@ -919,10 +918,11 @@ TEST_P(SeqnoTimeTablePropTest, PrePopulateInDB) { ASSERT_EQ(db_->GetLatestSequenceNumber(), 0); // And even if we re-open read-write, we do not get pre-population, - // because that's only for new DBs. + // because that's only for new DBs. We just get a single bootstrap + // entry as a lower bound on write times of future writes. Reopen(track_options); sttm = dbfull()->TEST_GetSeqnoToTimeMapping(); - ASSERT_EQ(sttm.Size(), 0); + ASSERT_EQ(sttm.Size(), 1); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0); } } diff --git a/db/seqno_to_time_mapping.cc b/db/seqno_to_time_mapping.cc index b540fd919671..36da27c5bf03 100644 --- a/db/seqno_to_time_mapping.cc +++ b/db/seqno_to_time_mapping.cc @@ -490,7 +490,7 @@ bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) { return added; } -bool SeqnoToTimeMapping::PrePopulate(SequenceNumber from_seqno, +void SeqnoToTimeMapping::PrePopulate(SequenceNumber from_seqno, SequenceNumber to_seqno, uint64_t from_time, uint64_t to_time) { assert(Empty()); @@ -505,8 +505,6 @@ bool SeqnoToTimeMapping::PrePopulate(SequenceNumber from_seqno, (to_seqno - from_seqno); pairs_.emplace_back(i, t); } - - return /*success*/ true; } std::string SeqnoToTimeMapping::ToHumanString() const { diff --git a/db/seqno_to_time_mapping.h b/db/seqno_to_time_mapping.h index 741e64369435..a74041fd9a0e 100644 --- a/db/seqno_to_time_mapping.h +++ b/db/seqno_to_time_mapping.h @@ -138,7 +138,7 @@ class SeqnoToTimeMapping { // Adds a series of mappings interpolating from from_seqno->from_time to // to_seqno->to_time. This can only be called on an empty object and both // seqno range and time range are inclusive. - bool PrePopulate(SequenceNumber from_seqno, SequenceNumber to_seqno, + void PrePopulate(SequenceNumber from_seqno, SequenceNumber to_seqno, uint64_t from_time, uint64_t to_time); // Append a new entry to the list. The `seqno` should be >= all previous @@ -148,6 +148,10 @@ class SeqnoToTimeMapping { // rather than creating a new entry. bool Append(SequenceNumber seqno, uint64_t time); + bool Append(std::pair seqno_time_pair) { + return Append(seqno_time_pair.first, seqno_time_pair.second); + } + // Clear all entries and (re-)enter enforced mode if not already in that // state. Enforced limits are unchanged. void Clear() { @@ -274,6 +278,48 @@ class SeqnoToTimeMapping { pair_const_iterator FindGreaterEqSeqno(SequenceNumber seqno) const; }; +// A struct to help combining settings across column families +struct MinAndMaxPreserveSeconds { + uint64_t min_preserve_seconds = std::numeric_limits::max(); + uint64_t max_preserve_seconds = std::numeric_limits::min(); + + MinAndMaxPreserveSeconds() = default; + + template + explicit MinAndMaxPreserveSeconds(const CFOpts& opts) { + Combine(opts); + } + + bool IsEnabled() const { + return min_preserve_seconds != std::numeric_limits::max(); + } + + // Incorporate another CF's settings into the result. If preserve/preclude are + // disabled for this CF, they are excluded from the result. + template + void Combine(const CFOpts& opts) { + uint64_t preserve_seconds = std::max(opts.preserve_internal_time_seconds, + opts.preclude_last_level_data_seconds); + if (preserve_seconds > 0) { + min_preserve_seconds = std::min(preserve_seconds, min_preserve_seconds); + max_preserve_seconds = std::max(preserve_seconds, max_preserve_seconds); + } + } + + // Choose how many seconds between mapping samples + uint64_t GetRecodingCadence() const { + if (IsEnabled()) { + // round up to 1 when the time_duration is smaller than + // kMaxSeqnoTimePairsPerCF + return (min_preserve_seconds + kMaxSeqnoTimePairsPerCF - 1) / + kMaxSeqnoTimePairsPerCF; + } else { + // disabled + return 0; + } + } +}; + // === Utility methods used for TimedPut === // // Pack a value Slice and a unix write time into buffer `buf` and return a Slice diff --git a/db/table_cache.cc b/db/table_cache.cc index 773446b6a583..0e4e9f2e5155 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -98,6 +98,8 @@ Status TableCache::GetTableReader( std::unique_ptr file; FileOptions fopts = file_options; fopts.temperature = file_temperature; + fopts.file_checksum = file_meta.file_checksum; + fopts.file_checksum_func_name = file_meta.file_checksum_func_name; Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); TEST_SYNC_POINT_CALLBACK("TableCache::GetTableReader:BeforeOpenFile", const_cast(&s)); @@ -113,8 +115,7 @@ Status TableCache::GetTableReader( Status temp_s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); if (temp_s.ok()) { - temp_s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, - nullptr); + temp_s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr); } if (temp_s.ok()) { RecordTick(ioptions_.stats, NO_FILE_OPENS); @@ -146,7 +147,8 @@ Status TableCache::GetTableReader( s = mutable_cf_options.table_factory->NewTableReader( ro, TableReaderOptions( - ioptions_, mutable_cf_options.prefix_extractor, file_options, + ioptions_, mutable_cf_options.prefix_extractor, + mutable_cf_options.compression_manager.get(), file_options, internal_comparator, mutable_cf_options.block_protection_bytes_per_key, skip_filters, immortal_tables_, false /* force_direct_prefetch */, level, @@ -205,6 +207,7 @@ Status TableCache::FindTable( RecordTick(ioptions_.stats, NO_FILE_ERRORS); // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. + IGNORE_STATUS_IF_ERROR(s); } else { s = cache_.Insert(key, table_reader.get(), 1, handle); if (s.ok()) { diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 3c7d8a61d739..a3e249887ab1 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -76,7 +76,8 @@ class VersionBuilderTest : public testing::Test { oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, - /* user_defined_timestamps_persisted */ true); + /* user_defined_timestamps_persisted */ true, /* min timestamp */ "", + /* max timestamp */ ""); f->compensated_file_size = file_size; f->num_entries = num_entries; f->num_deletions = num_deletions; diff --git a/db/version_edit.cc b/db/version_edit.cc index f666308bc071..67a6f3cc5ba3 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -112,124 +112,9 @@ bool VersionEdit::EncodeTo(std::string* dst, f.epoch_number == kUnknownEpochNumber) { return false; } - PutVarint32(dst, kNewFile4); - PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber()); - PutVarint64(dst, f.fd.GetFileSize()); - EncodeFileBoundaries(dst, f, ts_sz.value()); - PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno); - // Customized fields' format: - // +-----------------------------+ - // | 1st field's tag (varint32) | - // +-----------------------------+ - // | 1st field's size (varint32) | - // +-----------------------------+ - // | bytes for 1st field | - // | (based on size decoded) | - // +-----------------------------+ - // | | - // | ...... | - // | | - // +-----------------------------+ - // | last field's size (varint32)| - // +-----------------------------+ - // | bytes for last field | - // | (based on size decoded) | - // +-----------------------------+ - // | terminating tag (varint32) | - // +-----------------------------+ - // - // Customized encoding for fields: - // tag kPathId: 1 byte as path_id - // tag kNeedCompaction: - // now only can take one char value 1 indicating need-compaction - // - PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime); - std::string varint_oldest_ancester_time; - PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time); - TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime", - &varint_oldest_ancester_time); - PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time)); - - PutVarint32(dst, NewFileCustomTag::kFileCreationTime); - std::string varint_file_creation_time; - PutVarint64(&varint_file_creation_time, f.file_creation_time); - TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime", - &varint_file_creation_time); - PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time)); - - PutVarint32(dst, NewFileCustomTag::kEpochNumber); - std::string varint_epoch_number; - PutVarint64(&varint_epoch_number, f.epoch_number); - PutLengthPrefixedSlice(dst, Slice(varint_epoch_number)); - - if (f.file_checksum_func_name != kUnknownFileChecksumFuncName) { - PutVarint32(dst, NewFileCustomTag::kFileChecksum); - PutLengthPrefixedSlice(dst, Slice(f.file_checksum)); - - PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName); - PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name)); - } - - if (f.fd.GetPathId() != 0) { - PutVarint32(dst, NewFileCustomTag::kPathId); - char p = static_cast(f.fd.GetPathId()); - PutLengthPrefixedSlice(dst, Slice(&p, 1)); - } - if (f.temperature != Temperature::kUnknown) { - PutVarint32(dst, NewFileCustomTag::kTemperature); - char p = static_cast(f.temperature); - PutLengthPrefixedSlice(dst, Slice(&p, 1)); - } - if (f.marked_for_compaction) { - PutVarint32(dst, NewFileCustomTag::kNeedCompaction); - char p = static_cast(1); - PutLengthPrefixedSlice(dst, Slice(&p, 1)); - } - if (has_min_log_number_to_keep_ && !min_log_num_written) { - PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack); - std::string varint_log_number; - PutFixed64(&varint_log_number, min_log_number_to_keep_); - PutLengthPrefixedSlice(dst, Slice(varint_log_number)); - min_log_num_written = true; - } - if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { - PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber); - std::string oldest_blob_file_number; - PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number); - PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number)); - } - UniqueId64x2 unique_id = f.unique_id; - TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:UniqueId", &unique_id); - if (unique_id != kNullUniqueId64x2) { - PutVarint32(dst, NewFileCustomTag::kUniqueId); - std::string unique_id_str = EncodeUniqueIdBytes(&unique_id); - PutLengthPrefixedSlice(dst, Slice(unique_id_str)); - } - if (f.compensated_range_deletion_size) { - PutVarint32(dst, kCompensatedRangeDeletionSize); - std::string compensated_range_deletion_size; - PutVarint64(&compensated_range_deletion_size, - f.compensated_range_deletion_size); - PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size)); - } - if (f.tail_size) { - PutVarint32(dst, NewFileCustomTag::kTailSize); - std::string varint_tail_size; - PutVarint64(&varint_tail_size, f.tail_size); - PutLengthPrefixedSlice(dst, Slice(varint_tail_size)); - } - if (!f.user_defined_timestamps_persisted) { - // The default value for the flag is true, it's only explicitly persisted - // when it's false. We are putting 0 as the value here to signal false - // (i.e. UDTS not persisted). - PutVarint32(dst, NewFileCustomTag::kUserDefinedTimestampsPersisted); - char p = static_cast(0); - PutLengthPrefixedSlice(dst, Slice(&p, 1)); - } - TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", - dst); - - PutVarint32(dst, NewFileCustomTag::kTerminate); + EncodeToNewFile4(f, new_files_[i].first, ts_sz.value(), + has_min_log_number_to_keep_, min_log_number_to_keep_, + min_log_num_written, dst); } for (const auto& blob_file_addition : blob_file_additions_) { @@ -288,9 +173,151 @@ bool VersionEdit::EncodeTo(std::string* dst, char p = static_cast(persist_user_defined_timestamps_); PutLengthPrefixedSlice(dst, Slice(&p, 1)); } + + if (HasSubcompactionProgress()) { + PutVarint32(dst, kSubcompactionProgress); + std::string progress_data; + subcompaction_progress_.EncodeTo(&progress_data); + PutLengthPrefixedSlice(dst, progress_data); + } + return true; } +void VersionEdit::EncodeToNewFile4(const FileMetaData& f, int level, + size_t ts_sz, + bool has_min_log_number_to_keep, + uint64_t min_log_number_to_keep, + bool& min_log_num_written, + std::string* dst) { + PutVarint32(dst, kNewFile4); + PutVarint32Varint64(dst, level, f.fd.GetNumber()); + PutVarint64(dst, f.fd.GetFileSize()); + EncodeFileBoundaries(dst, f, ts_sz); + PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno); + // Customized fields' format: + // +-----------------------------+ + // | 1st field's tag (varint32) | + // +-----------------------------+ + // | 1st field's size (varint32) | + // +-----------------------------+ + // | bytes for 1st field | + // | (based on size decoded) | + // +-----------------------------+ + // | | + // | ...... | + // | | + // +-----------------------------+ + // | last field's size (varint32)| + // +-----------------------------+ + // | bytes for last field | + // | (based on size decoded) | + // +-----------------------------+ + // | terminating tag (varint32) | + // +-----------------------------+ + // + // Customized encoding for fields: + // tag kPathId: 1 byte as path_id + // tag kNeedCompaction: + // now only can take one char value 1 indicating need-compaction + // + PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime); + std::string varint_oldest_ancester_time; + PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time); + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime", + &varint_oldest_ancester_time); + PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time)); + + PutVarint32(dst, NewFileCustomTag::kFileCreationTime); + std::string varint_file_creation_time; + PutVarint64(&varint_file_creation_time, f.file_creation_time); + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime", + &varint_file_creation_time); + PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time)); + + PutVarint32(dst, NewFileCustomTag::kEpochNumber); + std::string varint_epoch_number; + PutVarint64(&varint_epoch_number, f.epoch_number); + PutLengthPrefixedSlice(dst, Slice(varint_epoch_number)); + + if (f.file_checksum_func_name != kUnknownFileChecksumFuncName) { + PutVarint32(dst, NewFileCustomTag::kFileChecksum); + PutLengthPrefixedSlice(dst, Slice(f.file_checksum)); + + PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName); + PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name)); + } + + if (f.fd.GetPathId() != 0) { + PutVarint32(dst, NewFileCustomTag::kPathId); + char p = static_cast(f.fd.GetPathId()); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } + if (f.temperature != Temperature::kUnknown) { + PutVarint32(dst, NewFileCustomTag::kTemperature); + char p = static_cast(f.temperature); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } + if (f.marked_for_compaction) { + PutVarint32(dst, NewFileCustomTag::kNeedCompaction); + char p = static_cast(1); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } + if (has_min_log_number_to_keep && !min_log_num_written) { + PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack); + std::string varint_log_number; + PutFixed64(&varint_log_number, min_log_number_to_keep); + PutLengthPrefixedSlice(dst, Slice(varint_log_number)); + min_log_num_written = true; + } + if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { + PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber); + std::string oldest_blob_file_number; + PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number); + PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number)); + } + UniqueId64x2 unique_id = f.unique_id; + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:UniqueId", &unique_id); + if (unique_id != kNullUniqueId64x2) { + PutVarint32(dst, NewFileCustomTag::kUniqueId); + std::string unique_id_str = EncodeUniqueIdBytes(&unique_id); + PutLengthPrefixedSlice(dst, Slice(unique_id_str)); + } + if (f.compensated_range_deletion_size) { + PutVarint32(dst, NewFileCustomTag::kCompensatedRangeDeletionSize); + std::string compensated_range_deletion_size; + PutVarint64(&compensated_range_deletion_size, + f.compensated_range_deletion_size); + PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size)); + } + if (f.tail_size) { + PutVarint32(dst, NewFileCustomTag::kTailSize); + std::string varint_tail_size; + PutVarint64(&varint_tail_size, f.tail_size); + PutLengthPrefixedSlice(dst, Slice(varint_tail_size)); + } + if (!f.user_defined_timestamps_persisted) { + // The default value for the flag is true, it's only explicitly persisted + // when it's false. We are putting 0 as the value here to signal false + // (i.e. UDTS not persisted). + PutVarint32(dst, NewFileCustomTag::kUserDefinedTimestampsPersisted); + char p = static_cast(0); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } + // Encode min/max timestamp if they are non-empty + if (!f.min_timestamp.empty()) { + PutVarint32(dst, NewFileCustomTag::kMinTimestamp); + PutLengthPrefixedSlice(dst, Slice(f.min_timestamp)); + } + if (!f.max_timestamp.empty()) { + PutVarint32(dst, NewFileCustomTag::kMaxTimestamp); + PutLengthPrefixedSlice(dst, Slice(f.max_timestamp)); + } + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", + dst); + + PutVarint32(dst, NewFileCustomTag::kTerminate); +} static bool GetInternalKey(Slice* input, InternalKey* dst) { Slice str; if (GetLengthPrefixedSlice(input, &str)) { @@ -301,12 +328,12 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) { } } -bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) { +bool VersionEdit::GetLevel(Slice* input, int* level, int& max_level) { uint32_t v = 0; if (GetVarint32(input, &v)) { *level = v; - if (max_level_ < *level) { - max_level_ = *level; + if (max_level < *level) { + max_level = *level; } return true; } else { @@ -314,16 +341,18 @@ bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) { } } -const char* VersionEdit::DecodeNewFile4From(Slice* input) { - const char* msg = nullptr; +const char* VersionEdit::DecodeNewFile4From(Slice* input, int& max_level, + uint64_t& min_log_number_to_keep, + bool& has_min_log_number_to_keep, + NewFiles& new_files, + FileMetaData& f) { int level = 0; - FileMetaData f; uint64_t number = 0; uint32_t path_id = 0; uint64_t file_size = 0; SequenceNumber smallest_seqno = 0; SequenceNumber largest_seqno = kMaxSequenceNumber; - if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) && + if (GetLevel(input, &level, max_level) && GetVarint64(input, &number) && GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) && GetInternalKey(input, &f.largest) && GetVarint64(input, &smallest_seqno) && @@ -381,10 +410,10 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { case kMinLogNumberToKeepHack: // This is a hack to encode kMinLogNumberToKeep in a // forward-compatible fashion. - if (!GetFixed64(&field, &min_log_number_to_keep_)) { + if (!GetFixed64(&field, &min_log_number_to_keep)) { return "deleted log number malformatted"; } - has_min_log_number_to_keep_ = true; + has_min_log_number_to_keep = true; break; case kOldestBlobFileNumber: if (!GetVarint64(&field, &f.oldest_blob_file_number)) { @@ -396,7 +425,7 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { return "temperature field wrong size"; } else { Temperature casted_field = static_cast(field[0]); - if (casted_field <= Temperature::kCold) { + if (casted_field < Temperature::kLastTemperature) { f.temperature = casted_field; } } @@ -423,6 +452,12 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { } f.user_defined_timestamps_persisted = (field[0] == 1); break; + case kMinTimestamp: + f.min_timestamp = field.ToString(); + break; + case kMaxTimestamp: + f.max_timestamp = field.ToString(); + break; default: if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { // Should not proceed if cannot understand it @@ -436,13 +471,12 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { } f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno); - new_files_.push_back(std::make_pair(level, f)); + new_files.emplace_back(level, f); return nullptr; } void VersionEdit::EncodeFileBoundaries(std::string* dst, - const FileMetaData& meta, - size_t ts_sz) const { + const FileMetaData& meta, size_t ts_sz) { if (ts_sz == 0 || meta.user_defined_timestamps_persisted) { PutLengthPrefixedSlice(dst, meta.smallest.Encode()); PutLengthPrefixedSlice(dst, meta.largest.Encode()); @@ -545,7 +579,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) { break; case kCompactCursor: - if (GetLevel(&input, &level, &msg) && GetInternalKey(&input, &key)) { + if (GetLevel(&input, &level, max_level_) && + GetInternalKey(&input, &key)) { // Here we re-use the output format of compact pointer in LevelDB // to persist compact_cursors_ compact_cursors_.push_back(std::make_pair(level, key)); @@ -558,7 +593,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) { case kDeletedFile: { uint64_t number = 0; - if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) { + if (GetLevel(&input, &level, max_level_) && + GetVarint64(&input, &number)) { deleted_files_.insert(std::make_pair(level, number)); } else { if (!msg) { @@ -571,8 +607,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) { case kNewFile: { uint64_t number = 0; uint64_t file_size = 0; - if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && - GetVarint64(&input, &file_size) && + if (GetLevel(&input, &level, max_level_) && + GetVarint64(&input, &number) && GetVarint64(&input, &file_size) && GetInternalKey(&input, &f.smallest) && GetInternalKey(&input, &f.largest)) { f.fd = FileDescriptor(number, 0, file_size); @@ -589,8 +625,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) { uint64_t file_size = 0; SequenceNumber smallest_seqno = 0; SequenceNumber largest_seqno = kMaxSequenceNumber; - if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && - GetVarint64(&input, &file_size) && + if (GetLevel(&input, &level, max_level_) && + GetVarint64(&input, &number) && GetVarint64(&input, &file_size) && GetInternalKey(&input, &f.smallest) && GetInternalKey(&input, &f.largest) && GetVarint64(&input, &smallest_seqno) && @@ -612,8 +648,9 @@ Status VersionEdit::DecodeFrom(const Slice& src) { uint64_t file_size = 0; SequenceNumber smallest_seqno = 0; SequenceNumber largest_seqno = kMaxSequenceNumber; - if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && - GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) && + if (GetLevel(&input, &level, max_level_) && + GetVarint64(&input, &number) && GetVarint32(&input, &path_id) && + GetVarint64(&input, &file_size) && GetInternalKey(&input, &f.smallest) && GetInternalKey(&input, &f.largest) && GetVarint64(&input, &smallest_seqno) && @@ -630,7 +667,10 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } case kNewFile4: { - msg = DecodeNewFile4From(&input); + FileMetaData ignored_file; + msg = DecodeNewFile4From(&input, max_level_, min_log_number_to_keep_, + has_min_log_number_to_keep_, new_files_, + ignored_file); break; } @@ -767,6 +807,23 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } break; + case kSubcompactionProgress: { + Slice encoded; + if (!GetLengthPrefixedSlice(&input, &encoded)) { + msg = "SubcompactionProgress not prefixed by length"; + break; + } + + SubcompactionProgress progress; + Status s = progress.DecodeFrom(&encoded); + if (!s.ok()) { + return s; + } + + SetSubcompactionProgress(progress); + break; + } + default: if (tag & kTagSafeIgnoreMask) { // Tag from future which can be safely ignored. @@ -933,6 +990,10 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append("\n FullHistoryTsLow: "); r.append(Slice(full_history_ts_low_).ToString(hex_key)); } + if (HasSubcompactionProgress()) { + r.append("\n SubcompactionProgress: "); + r.append(subcompaction_progress_.ToString()); + } r.append("\n}\n"); return r; } @@ -1082,9 +1143,301 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { jw << "FullHistoryTsLow" << Slice(full_history_ts_low_).ToString(hex_key); } + if (HasSubcompactionProgress()) { + jw << "SubcompactionProgress" << subcompaction_progress_.ToString(); + } + jw.EndObject(); return jw.Get(); } +void SubcompactionProgressPerLevel::EncodeTo(std::string* dst) const { + if (num_processed_output_records_ > 0) { + PutVarint32( + dst, + SubcompactionProgressPerLevelCustomTag::kNumProcessedOutputRecords); + std::string varint_records; + PutVarint64(&varint_records, num_processed_output_records_); + PutLengthPrefixedSlice(dst, varint_records); + } + + if (!output_files_.empty()) { + PutVarint32(dst, SubcompactionProgressPerLevelCustomTag::kOutputFilesDelta); + std::string files_data; + EncodeOutputFiles(&files_data); + PutLengthPrefixedSlice(dst, files_data); + } + + PutVarint32(dst, SubcompactionProgressPerLevelCustomTag:: + kSubcompactionProgressPerLevelTerminate); +} + +Status SubcompactionProgressPerLevel::DecodeFrom(Slice* input) { + Clear(); + + while (true) { + uint32_t tag = 0; + if (!GetVarint32(input, &tag)) { + return Status::Corruption("SubcompactionProgressPerLevel", "tag error"); + } + + if (tag == SubcompactionProgressPerLevelCustomTag:: + kSubcompactionProgressPerLevelTerminate) { + break; + } + + Slice field; + if (!GetLengthPrefixedSlice(input, &field)) { + return Status::Corruption("SubcompactionProgressPerLevel", + "field length prefixed slice error"); + } + + switch (tag) { + case SubcompactionProgressPerLevelCustomTag::kNumProcessedOutputRecords: { + if (!GetVarint64(&field, &num_processed_output_records_)) { + return Status::Corruption("SubcompactionProgressPerLevel", + "invalid num_processed_output_records_"); + } + break; + } + + case SubcompactionProgressPerLevelCustomTag::kOutputFilesDelta: { + Status s = DecodeOutputFiles(&field, output_files_); + if (!s.ok()) { + return s; + } + break; + } + + default: + // Forward compatibility: Handle unknown tags + if ((tag & SubcompactionProgressPerLevelCustomTag:: + kSubcompactionProgressPerLevelCustomTagSafeIgnoreMask) != + 0) { + break; + } else { + return Status::NotSupported("SubcompactionProgress", + "unsupported critical custom field"); + } + } + } + + return Status::OK(); +} + +void SubcompactionProgressPerLevel::EncodeOutputFiles(std::string* dst) const { + size_t new_files_count = + output_files_.size() > last_persisted_output_files_count_ + ? output_files_.size() - last_persisted_output_files_count_ + : 0; + + assert(new_files_count > 0); + + PutVarint32(dst, static_cast(new_files_count)); + + for (size_t i = last_persisted_output_files_count_; i < output_files_.size(); + ++i) { + std::string file_dst; + bool ignored_min_log_written = false; + + VersionEdit::EncodeToNewFile4( + output_files_[i], -1 /* level */, 0 /* ts_sz */, + false /* has_min_log_number_to_keep */, 0 /* min_log_number_to_keep */, + ignored_min_log_written, &file_dst); + + PutLengthPrefixedSlice(dst, file_dst); + } +} + +Status SubcompactionProgressPerLevel::DecodeOutputFiles( + Slice* input, autovector& output_files) { + uint32_t new_file_count = 0; + if (!GetVarint32(input, &new_file_count)) { + return Status::Corruption("SubcompactionProgressPerLevel", + "new output file count"); + } + + assert(output_files.size() == 0); + + output_files.reserve(new_file_count); + + for (uint32_t i = 0; i < new_file_count; ++i) { + Slice file_input; + if (!GetLengthPrefixedSlice(input, &file_input)) { + return Status::Corruption("SubcompactionProgressPerLevel", + "output file metadata"); + } + + uint32_t tag = 0; + if (!GetVarint32(&file_input, &tag) || tag != kNewFile4) { + return Status::Corruption("SubcompactionProgressPerLevel", + "expected kNewFile4 tag"); + } + + int ignored_max_level = -1; + uint64_t ignored_min_log_number_to_keep = 0; + bool ignored_has_min_log_number_to_keep = false; + VersionEdit::NewFiles ignored_new_files; + FileMetaData file; + + const char* err = VersionEdit::DecodeNewFile4From( + &file_input, ignored_max_level, ignored_min_log_number_to_keep, + ignored_has_min_log_number_to_keep, ignored_new_files, file); + + if (err != nullptr) { + return Status::Corruption("SubcompactionProgressPerLevel", err); + } + + output_files.push_back(std::move(file)); + } + + return Status::OK(); +} + +void SubcompactionProgress::EncodeTo(std::string* dst) const { + if (!next_internal_key_to_compact.empty()) { + PutVarint32(dst, SubcompactionProgressCustomTag::kNextInternalKeyToCompact); + PutLengthPrefixedSlice(dst, next_internal_key_to_compact); + } + + PutVarint32(dst, SubcompactionProgressCustomTag::kNumProcessedInputRecords); + std::string varint_records; + PutVarint64(&varint_records, num_processed_input_records); + PutLengthPrefixedSlice(dst, varint_records); + + if (output_level_progress.GetOutputFiles().size() > + output_level_progress.GetLastPersistedOutputFilesCount()) { + PutVarint32(dst, SubcompactionProgressCustomTag::kOutputLevelProgress); + std::string level_progress_data; + output_level_progress.EncodeTo(&level_progress_data); + PutLengthPrefixedSlice(dst, level_progress_data); + } + + if (proximal_output_level_progress.GetOutputFiles().size() > + proximal_output_level_progress.GetLastPersistedOutputFilesCount()) { + PutVarint32(dst, + SubcompactionProgressCustomTag::kProximalOutputLevelProgress); + std::string level_progress_data; + proximal_output_level_progress.EncodeTo(&level_progress_data); + PutLengthPrefixedSlice(dst, level_progress_data); + } + PutVarint32(dst, + SubcompactionProgressCustomTag::kSubcompactionProgressTerminate); +} + +Status SubcompactionProgress::DecodeFrom(Slice* input) { + Clear(); + + while (true) { + uint32_t custom_tag = 0; + if (!GetVarint32(input, &custom_tag)) { + return Status::Corruption("SubcompactionProgress", + "custom field tag error"); + } + + if (custom_tag == + SubcompactionProgressCustomTag::kSubcompactionProgressTerminate) { + break; + } + + Slice field; + if (!GetLengthPrefixedSlice(input, &field)) { + return Status::Corruption("SubcompactionProgress", + "custom field length prefixed slice error"); + } + + switch (custom_tag) { + case SubcompactionProgressCustomTag::kNextInternalKeyToCompact: + next_internal_key_to_compact = field.ToString(); + break; + + case SubcompactionProgressCustomTag::kNumProcessedInputRecords: + if (!GetVarint64(&field, &num_processed_input_records)) { + return Status::Corruption("SubcompactionProgress", + "invalid num_processed_input_records"); + } + break; + + case SubcompactionProgressCustomTag::kOutputLevelProgress: { + Status s = output_level_progress.DecodeFrom(&field); + if (!s.ok()) { + return s; + } + break; + } + + case SubcompactionProgressCustomTag::kProximalOutputLevelProgress: { + Status s = proximal_output_level_progress.DecodeFrom(&field); + if (!s.ok()) { + return s; + } + break; + } + + default: + if ((custom_tag & SubcompactionProgressCustomTag:: + kSubcompactionProgressCustomTagSafeIgnoreMask) != + 0) { + break; + } else { + return Status::NotSupported("SubcompactionProgress", + "unsupported critical custom field"); + } + } + } + + return Status::OK(); +} + +bool SubcompactionProgressBuilder::ProcessVersionEdit(const VersionEdit& edit) { + if (!edit.HasSubcompactionProgress()) { + return false; + } + + const SubcompactionProgress& progress = edit.GetSubcompactionProgress(); + + MergeDeltaProgress(progress); + + has_subcompaction_progress_ = true; + + return true; +} + +void SubcompactionProgressBuilder::MergeDeltaProgress( + const SubcompactionProgress& delta_progress) { + accumulated_subcompaction_progress_.next_internal_key_to_compact = + delta_progress.next_internal_key_to_compact; + + accumulated_subcompaction_progress_.num_processed_input_records = + delta_progress.num_processed_input_records; + + MaybeMergeDeltaProgressPerLevel( + accumulated_subcompaction_progress_.output_level_progress, + delta_progress.output_level_progress); + + MaybeMergeDeltaProgressPerLevel( + accumulated_subcompaction_progress_.proximal_output_level_progress, + delta_progress.proximal_output_level_progress); +} + +void SubcompactionProgressBuilder::MaybeMergeDeltaProgressPerLevel( + SubcompactionProgressPerLevel& accumulated_level_progress, + const SubcompactionProgressPerLevel& delta_level_progress) { + const auto& delta_files = delta_level_progress.GetOutputFiles(); + if (delta_files.empty()) { + return; + } + for (const FileMetaData& file : delta_files) { + accumulated_level_progress.AddToOutputFiles(file); // Stored as copy + } + + accumulated_level_progress.SetNumProcessedOutputRecords( + delta_level_progress.GetNumProcessedOutputRecords()); +} + +void SubcompactionProgressBuilder::Clear() { + accumulated_subcompaction_progress_.Clear(); + has_subcompaction_progress_ = false; +} } // namespace ROCKSDB_NAMESPACE diff --git a/db/version_edit.h b/db/version_edit.h index 9189b4628109..ee6a6b01be43 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -25,6 +25,7 @@ #include "rocksdb/advanced_options.h" #include "table/table_reader.h" #include "table/unique_id_impl.h" +#include "test_util/sync_point.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { @@ -72,6 +73,23 @@ enum Tag : uint32_t { kWalAddition2, kWalDeletion2, kPersistUserDefinedTimestamps, + kSubcompactionProgress, +}; + +enum SubcompactionProgressPerLevelCustomTag : uint32_t { + kSubcompactionProgressPerLevelTerminate = 1, // End of fields marker + kOutputFilesDelta = 2, + kNumProcessedOutputRecords = 3, + kSubcompactionProgressPerLevelCustomTagSafeIgnoreMask = 1 << 16, +}; + +enum SubcompactionProgressCustomTag : uint32_t { + kSubcompactionProgressTerminate = 1, // End of fields marker + kNextInternalKeyToCompact = 2, + kNumProcessedInputRecords = 3, + kOutputLevelProgress = 4, + kProximalOutputLevelProgress = 5, + kSubcompactionProgressCustomTagSafeIgnoreMask = 1 << 16, }; enum NewFileCustomTag : uint32_t { @@ -110,7 +128,7 @@ constexpr uint64_t kUnknownOldestAncesterTime = 0; constexpr uint64_t kUnknownNewestKeyTime = 0; constexpr uint64_t kUnknownFileCreationTime = 0; constexpr uint64_t kUnknownEpochNumber = 0; -// If `Options::allow_ingest_behind` is true, this epoch number +// If `Options::cf_allow_ingest_behind` is true, this epoch number // will be dedicated to files ingested behind. constexpr uint64_t kReservedEpochNumberForFileIngestedBehind = 1; @@ -259,6 +277,14 @@ struct FileMetaData { // false, it's explicitly written to Manifest. bool user_defined_timestamps_persisted = true; + // Minimum user-defined timestamp in the file. Empty if no UDT or unknown. + // This is populated from the table properties "rocksdb.timestamp_min". + std::string min_timestamp; + + // Maximum user-defined timestamp in the file. Empty if no UDT or unknown. + // This is populated from the table properties "rocksdb.timestamp_max". + std::string max_timestamp; + FileMetaData() = default; FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size, @@ -271,7 +297,9 @@ struct FileMetaData { const std::string& _file_checksum_func_name, UniqueId64x2 _unique_id, const uint64_t _compensated_range_deletion_size, - uint64_t _tail_size, bool _user_defined_timestamps_persisted) + uint64_t _tail_size, bool _user_defined_timestamps_persisted, + const std::string& _min_timestamp, + const std::string& _max_timestamp) : fd(file, file_path_id, file_size, smallest_seq, largest_seq), smallest(smallest_key), largest(largest_key), @@ -286,7 +314,9 @@ struct FileMetaData { file_checksum_func_name(_file_checksum_func_name), unique_id(std::move(_unique_id)), tail_size(_tail_size), - user_defined_timestamps_persisted(_user_defined_timestamps_persisted) { + user_defined_timestamps_persisted(_user_defined_timestamps_persisted), + min_timestamp(_min_timestamp), + max_timestamp(_max_timestamp) { TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); } @@ -369,7 +399,8 @@ struct FileMetaData { usage += sizeof(*this); #endif // ROCKSDB_MALLOC_USABLE_SIZE usage += smallest.size() + largest.size() + file_checksum.size() + - file_checksum_func_name.size(); + file_checksum_func_name.size() + min_timestamp.size() + + max_timestamp.size(); return usage; } @@ -380,6 +411,33 @@ struct FileMetaData { assert(!res || fd.smallest_seqno == fd.largest_seqno); return res; } + + static uint64_t CalculateTailSize(uint64_t file_size, + const TableProperties& props) { +#ifndef NDEBUG + bool skip = false; + TEST_SYNC_POINT_CALLBACK("FileMetaData::CalculateTailSize", &skip); + if (skip) { + return 0; + } +#endif // NDEBUG + uint64_t tail_size = 0; + + // Differentiate between a file with no data blocks (tail_start_offset = 0) + // and a file with unknown tail_start_offset (also set to 0 due to + // non-negative integer storage limitation) + bool contain_no_data_blocks = + props.num_entries == 0 || + (props.num_entries > 0 && + (props.num_entries == props.num_range_deletions)); + + if (props.tail_start_offset > 0 || contain_no_data_blocks) { + assert(props.tail_start_offset <= file_size); + tail_size = file_size - props.tail_start_offset; + } + + return tail_size; + } }; // A compressed copy of file meta data that just contain minimum data needed @@ -413,12 +471,194 @@ struct LevelFilesBrief { } }; +struct SubcompactionProgressPerLevel { + uint64_t GetNumProcessedOutputRecords() const { + return num_processed_output_records_; + } + + void SetNumProcessedOutputRecords(uint64_t num) { + num_processed_output_records_ = num; + } + + const autovector& GetOutputFiles() const { + return output_files_; + } + + void AddToOutputFiles(const FileMetaData& file) { + output_files_.push_back(file); + } + + size_t GetLastPersistedOutputFilesCount() const { + return last_persisted_output_files_count_; + } + + void UpdateLastPersistedOutputFilesCount() { + last_persisted_output_files_count_ = output_files_.size(); + } + + void EncodeTo(std::string* dst) const; + + Status DecodeFrom(Slice* input); + + void Clear() { + num_processed_output_records_ = 0; + output_files_.clear(); + last_persisted_output_files_count_ = 0; + } + + std::string ToString() const { + std::ostringstream oss; + oss << "SubcompactionProgressPerLevel{"; + oss << " num_processed_output_records=" << num_processed_output_records_; + oss << ", output_files_count=" << output_files_.size(); + oss << ", last_persisted_output_files_count=" + << last_persisted_output_files_count_; + oss << " }"; + return oss.str(); + } + + void TEST_ClearOutputFiles() { output_files_.clear(); } + + private: + uint64_t num_processed_output_records_ = 0; + + autovector output_files_ = {}; + + // Number of files already persisted to help calculate the new output files to + // persist in the future. This is to prevent having to persist all the output + // files metadata so far every time of a "snapshot" of a progress is persisted + // which can lead to O(1+2+...+n) = O(n^2) file metadata being persisted. The + // current approach of persisting only the delta should always persist + // exactly the number (n) of output files in total. + size_t last_persisted_output_files_count_ = 0; + + void EncodeOutputFiles(std::string* dst) const; + + Status DecodeOutputFiles(Slice* input, + autovector& temp_storage); +}; + +struct SubcompactionProgress { + std::string next_internal_key_to_compact; + + uint64_t num_processed_input_records = 0; + + SubcompactionProgressPerLevel output_level_progress; + + SubcompactionProgressPerLevel proximal_output_level_progress; + + SubcompactionProgress() = default; + + void Clear() { + next_internal_key_to_compact.clear(); + num_processed_input_records = 0; + output_level_progress.Clear(); + proximal_output_level_progress.Clear(); + } + + void EncodeTo(std::string* dst) const; + + Status DecodeFrom(Slice* input); + + std::string ToString() const { + std::ostringstream oss; + oss << "SubcompactionProgress{"; + oss << " next_internal_key_to_compact="; + if (next_internal_key_to_compact.empty()) { + oss << ""; + } else { + ParsedInternalKey parsed_key; + Slice key_slice(next_internal_key_to_compact); + if (ParseInternalKey(key_slice, &parsed_key, false /* log_err_key */) + .ok()) { + oss << "user_key(hex)=" << parsed_key.user_key.ToString(true /* hex */); + oss << ", seq="; + if (parsed_key.sequence == kMaxSequenceNumber) { + oss << "kMaxSequenceNumber"; + } else { + oss << parsed_key.sequence; + } + oss << ", type="; + if (parsed_key.type == kValueTypeForSeek) { + oss << "kValueTypeForSeek"; + } else { + oss << static_cast(parsed_key.type); + } + } else { + oss << "raw=" << key_slice.ToString(true /* hex */); + } + } + oss << ", num_processed_input_records=" << num_processed_input_records; + oss << ", output_level_progress=" << output_level_progress.ToString(); + oss << ", proximal_output_level_progress=" + << proximal_output_level_progress.ToString(); + oss << " }"; + return oss.str(); + } +}; + +class VersionEdit; + +// Builder class to reconstruct complete subcompaction progress object +// from multiple decoded VersionEdits containing delta output files information +// of the same subcompaction. See +// `SubcompactionProgressPerLevel::last_persisted_output_files_count_`'s comment +// +// WARNING: This class currently assumes all input VersionEdits contain progress +// information for the SAME subcompaction. It does not validate +// progress data from different subcompactions so mixing progress from +// multiple subcompactions can result in corrupted state silently. The caller is +// responsible for ensuring all VersionEdits processed by a single instance +// of this builder correspond to the same subcompaction. +class SubcompactionProgressBuilder { + public: + SubcompactionProgressBuilder() = default; + + bool ProcessVersionEdit(const VersionEdit& edit); + + const SubcompactionProgress& GetAccumulatedSubcompactionProgress() const { + return accumulated_subcompaction_progress_; + } + + bool HasAccumulatedSubcompactionProgress() const { + return has_subcompaction_progress_; + } + + void Clear(); + + private: + void MergeDeltaProgress(const SubcompactionProgress& delta_progress); + + void MaybeMergeDeltaProgressPerLevel( + SubcompactionProgressPerLevel& accumulated_level_progress, + const SubcompactionProgressPerLevel& delta_level_progress); + + SubcompactionProgress accumulated_subcompaction_progress_; + bool has_subcompaction_progress_ = false; +}; + +// Type alias for backward compatibility - vector of subcompaction progress +using CompactionProgress = std::vector; + // The state of a DB at any given time is referred to as a Version. // Any modification to the Version is considered a Version Edit. A Version is // constructed by joining a sequence of Version Edits. Version Edits are written // to the MANIFEST file. class VersionEdit { public: + // Retrieve the table files added as well as their associated levels. + using NewFiles = std::vector>; + + static void EncodeToNewFile4(const FileMetaData& f, int level, size_t ts_sz, + bool has_min_log_number_to_keep, + uint64_t min_log_number_to_keep, + bool& min_log_num_written, std::string* dst); + + static const char* DecodeNewFile4From(Slice* input, int& max_level, + uint64_t& min_log_number_to_keep, + bool& has_min_log_number_to_keep, + NewFiles& new_files, FileMetaData& f); + void Clear(); void SetDBId(const std::string& db_id) { @@ -511,17 +751,19 @@ class VersionEdit { const std::string& file_checksum_func_name, const UniqueId64x2& unique_id, const uint64_t compensated_range_deletion_size, - uint64_t tail_size, bool user_defined_timestamps_persisted) { + uint64_t tail_size, bool user_defined_timestamps_persisted, + const std::string& min_timestamp = "", + const std::string& max_timestamp = "") { assert(smallest_seqno <= largest_seqno); new_files_.emplace_back( level, - FileMetaData(file, file_path_id, file_size, smallest, largest, - smallest_seqno, largest_seqno, marked_for_compaction, - temperature, oldest_blob_file_number, oldest_ancester_time, - file_creation_time, epoch_number, file_checksum, - file_checksum_func_name, unique_id, - compensated_range_deletion_size, tail_size, - user_defined_timestamps_persisted)); + FileMetaData( + file, file_path_id, file_size, smallest, largest, smallest_seqno, + largest_seqno, marked_for_compaction, temperature, + oldest_blob_file_number, oldest_ancester_time, file_creation_time, + epoch_number, file_checksum, file_checksum_func_name, unique_id, + compensated_range_deletion_size, tail_size, + user_defined_timestamps_persisted, min_timestamp, max_timestamp)); files_to_quarantine_.push_back(file); if (!HasLastSequence() || largest_seqno > GetLastSequence()) { SetLastSequence(largest_seqno); @@ -537,8 +779,6 @@ class VersionEdit { } } - // Retrieve the table files added as well as their associated levels. - using NewFiles = std::vector>; const NewFiles& GetNewFiles() const { return new_files_; } NewFiles& GetMutableNewFiles() { return new_files_; } @@ -708,6 +948,22 @@ class VersionEdit { full_history_ts_low_ = std::move(full_history_ts_low); } + void SetSubcompactionProgress(const SubcompactionProgress& progress) { + has_subcompaction_progress_ = true; + subcompaction_progress_ = progress; + } + + bool HasSubcompactionProgress() const { return has_subcompaction_progress_; } + + const SubcompactionProgress& GetSubcompactionProgress() const { + return subcompaction_progress_; + } + + void ClearSubcompactionProgress() { + has_subcompaction_progress_ = false; + subcompaction_progress_.Clear(); + } + // return true on success. // `ts_sz` is the size in bytes for the user-defined timestamp contained in // a user key. This argument is optional because it's only required for @@ -730,15 +986,22 @@ class VersionEdit { std::string DebugJSON(int edit_num, bool hex_key = false) const; private: - bool GetLevel(Slice* input, int* level, const char** msg); - - const char* DecodeNewFile4From(Slice* input); - + // Decode level information from serialized VersionEdit data and and track the + // maximum level seen. + // + // Parameters: + // input: Pointer to serialized data slice + // level: Output parameter for the decoded level value + // max_level: get updated if the decoded level is higher than passed in + // value + // + // Returns: true on successful decode, false on parse error + static bool GetLevel(Slice* input, int* level, int& max_level); // Encode file boundaries `FileMetaData.smallest` and `FileMetaData.largest`. // User-defined timestamps in the user key will be stripped if they shouldn't // be persisted. - void EncodeFileBoundaries(std::string* dst, const FileMetaData& meta, - size_t ts_sz) const; + static void EncodeFileBoundaries(std::string* dst, const FileMetaData& meta, + size_t ts_sz); int max_level_ = 0; std::string db_id_; @@ -789,6 +1052,9 @@ class VersionEdit { std::string full_history_ts_low_; bool persist_user_defined_timestamps_ = true; + bool has_subcompaction_progress_ = false; + SubcompactionProgress subcompaction_progress_; + // Newly created table files and blob files are eligible for deletion if they // are not registered as live files after the background jobs creating them // have finished. In case committing the VersionEdit containing such changes diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index 52947c484cf6..42d83b84d627 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -117,21 +117,43 @@ Status ListColumnFamiliesHandler::ApplyVersionEdit( return s; } +Status FileChecksumRetriever::FetchFileChecksumList( + FileChecksumList& file_checksum_list) { + Status s = Status::OK(); + for (const auto& [cf, file_checksums] : cf_file_checksums_) { + [[maybe_unused]] const auto& _ = cf; + for (const auto& [file_number, info] : file_checksums) { + if (!(s = file_checksum_list.InsertOneFileChecksum( + file_number, info.first, info.second)) + .ok()) { + break; + } + } + } + return s; +} + Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** /*unused*/) { + uint32_t column_family_id = edit.GetColumnFamily(); + if (edit.IsColumnFamilyDrop()) { + cf_file_checksums_.erase(column_family_id); + } for (const auto& deleted_file : edit.GetDeletedFiles()) { - Status s = file_checksum_list_.RemoveOneFileChecksum(deleted_file.second); - if (!s.ok()) { - return s; + if (cf_file_checksums_.find(column_family_id) == cf_file_checksums_.end()) { + return Status::NotFound(); } + if (cf_file_checksums_[column_family_id].find(deleted_file.second) == + cf_file_checksums_[column_family_id].end()) { + return Status::NotFound(); + } + cf_file_checksums_[column_family_id].erase(deleted_file.second); } for (const auto& new_file : edit.GetNewFiles()) { - Status s = file_checksum_list_.InsertOneFileChecksum( - new_file.second.fd.GetNumber(), new_file.second.file_checksum, - new_file.second.file_checksum_func_name); - if (!s.ok()) { - return s; - } + cf_file_checksums_[column_family_id].emplace( + new_file.second.fd.GetNumber(), + std::make_pair(new_file.second.file_checksum, + new_file.second.file_checksum_func_name)); } for (const auto& new_blob_file : edit.GetBlobFileAdditions()) { std::string checksum_value = new_blob_file.GetChecksumValue(); @@ -141,11 +163,9 @@ Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit, checksum_value = kUnknownFileChecksum; checksum_method = kUnknownFileChecksumFuncName; } - Status s = file_checksum_list_.InsertOneFileChecksum( - new_blob_file.GetBlobFileNumber(), checksum_value, checksum_method); - if (!s.ok()) { - return s; - } + cf_file_checksums_[column_family_id].emplace( + new_blob_file.GetBlobFileNumber(), + std::make_pair(checksum_value, checksum_method)); } return Status::OK(); } @@ -408,7 +428,7 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, if (cfd->IsDropped()) { continue; } - if (read_only_) { + if (version_set_->unchanging()) { cfd->table_cache()->SetTablesAreImmortal(); } *s = LoadTables(cfd, /*prefetch_index_and_filter_in_cache=*/false, @@ -471,8 +491,8 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, ColumnFamilyData* VersionEditHandler::CreateCfAndInit( const ColumnFamilyOptions& cf_options, const VersionEdit& edit) { uint32_t cf_id = edit.GetColumnFamily(); - ColumnFamilyData* cfd = - version_set_->CreateColumnFamily(cf_options, read_options_, &edit); + ColumnFamilyData* cfd = version_set_->CreateColumnFamily( + cf_options, read_options_, &edit, read_only_); assert(cfd != nullptr); cfd->set_initialized(); assert(builders_.find(cf_id) == builders_.end()); @@ -1135,6 +1155,15 @@ void DumpManifestHandler::CheckIterationResult(const log::Reader& reader, // Print out DebugStrings. Can include non-terminating null characters. fwrite(cfd->current()->DebugString(hex_).data(), sizeof(char), cfd->current()->DebugString(hex_).size(), stdout); + + fprintf(stdout, + "By default, manifest file dump prints LSM trees as if %d levels " + "were configured, " + "which is not necessarily true for the column family (CF) this " + "manifest is associated with. " + "Please consult other DB files, such as the OPTIONS file, to " + "confirm.\n", + cfd->ioptions().num_levels); } fprintf(stdout, "next_file_number %" PRIu64 " last_sequence %" PRIu64 diff --git a/db/version_edit_handler.h b/db/version_edit_handler.h index f3637ae73075..1d4b22e3c13e 100644 --- a/db/version_edit_handler.h +++ b/db/version_edit_handler.h @@ -80,19 +80,42 @@ class ListColumnFamiliesHandler : public VersionEditHandlerBase { class FileChecksumRetriever : public VersionEditHandlerBase { public: - FileChecksumRetriever(const ReadOptions& read_options, uint64_t max_read_size, - FileChecksumList& file_checksum_list) - : VersionEditHandlerBase(read_options, max_read_size), - file_checksum_list_(file_checksum_list) {} + FileChecksumRetriever(const ReadOptions& read_options, uint64_t max_read_size) + : VersionEditHandlerBase(read_options, max_read_size) {} ~FileChecksumRetriever() override {} + Status FetchFileChecksumList(FileChecksumList& file_checksum_list); + protected: Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** /*unused*/) override; private: - FileChecksumList& file_checksum_list_; + // Map from CF to file # to string pair, where first portion of the value + // is checksum, and second portion of the value is checksum function name. + // + // [column family id A] + // | + // |-- [file #1] -> [checksum #1, checksum function name #1] + // |-- [file #2] -> [checksum #2, checksum function name #2] + // | + // ... + // | + // |-- [file #N] -> [checksum #N, checksum function name #N] + // [column family id B] + // | + // |-- [file #1] -> [checksum #1, checksum function name #1] + // | + // ... + // | + // |-- [file #M] -> [checksum #M, checksum function name #M] + // | + // ... + std::unordered_map< + uint32_t, + std::unordered_map>> + cf_file_checksums_; }; using VersionBuilderUPtr = std::unique_ptr; @@ -198,7 +221,9 @@ class VersionEditHandler : public VersionEditHandlerBase { bool prefetch_index_and_filter_in_cache, bool is_initial_load); - virtual bool MustOpenAllColumnFamilies() const { return !read_only_; } + virtual bool MustOpenAllColumnFamilies() const { + return !version_set_->unchanging(); + } const bool read_only_; std::vector column_families_; @@ -334,10 +359,10 @@ class ManifestTailer : public VersionEditHandlerPointInTime { const ReadOptions& read_options, EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement::kMustPresent) - : VersionEditHandlerPointInTime(/*read_only=*/false, column_families, - version_set, io_tracer, read_options, - /*allow_incomplete_valid_version=*/false, - epoch_number_requirement), + : VersionEditHandlerPointInTime( + /*read_only=*/true, column_families, version_set, io_tracer, + read_options, + /*allow_incomplete_valid_version=*/false, epoch_number_requirement), mode_(Mode::kRecovery) {} Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level, diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 25235206994a..d5f6beee93cc 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -794,6 +794,339 @@ TEST(FileMetaDataTest, UpdateBoundariesBlobIndex) { } } +class SubcompactionProgressTest : public VersionEditTest { + protected: + static constexpr uint64_t kTestFileSize = 1024; + static constexpr SequenceNumber kTestSmallestSeq = 50; + static constexpr SequenceNumber kTestLargestSeq = 150; + static constexpr uint64_t kTestOldestAncesterTime = 12345; + static constexpr uint64_t kTestFileCreationTime = 67890; + static constexpr uint64_t kTestEpochNumber = 10; + static const std::string kTestChecksumFuncName; + + FileMetaData CreateTestFile(uint64_t file_number, const std::string& prefix) { + FileMetaData file; + file.fd = FileDescriptor(file_number, 0, kTestFileSize, kTestSmallestSeq, + kTestLargestSeq); + file.smallest = InternalKey(prefix + "a", kTestSmallestSeq, kTypeValue); + file.largest = InternalKey(prefix + "z", kTestLargestSeq, kTypeValue); + file.oldest_ancester_time = kTestOldestAncesterTime; + file.file_creation_time = kTestFileCreationTime; + file.epoch_number = kTestEpochNumber; + file.file_checksum = "checksum_" + std::to_string(file_number); + file.file_checksum_func_name = kTestChecksumFuncName; + file.marked_for_compaction = false; + file.temperature = Temperature::kUnknown; + return file; + } + + // Store external file metadata objects for testing + // These simulate files owned by CompactionOutputs + std::vector compaction_output_files_; + std::vector proximal_level_compaction_output_files_; + + SubcompactionProgress CreateSubcompactionProgress( + const std::string& next_key, uint64_t num_processed_input_records, + uint64_t num_processed_output_records, + uint64_t num_processed_proximal_level_output_records, + const std::vector& output_file_numbers = {}, + const std::vector& proximal_file_numbers = {}, + const std::string& file_prefix = "file_") { + SubcompactionProgress progress; + progress.next_internal_key_to_compact = next_key; + progress.num_processed_input_records = num_processed_input_records; + progress.output_level_progress.SetNumProcessedOutputRecords( + num_processed_output_records); + progress.proximal_output_level_progress.SetNumProcessedOutputRecords( + num_processed_proximal_level_output_records); + + for (uint64_t file_num : output_file_numbers) { + FileMetaData file = CreateTestFile(file_num, file_prefix + "output_"); + progress.output_level_progress.AddToOutputFiles(file); + } + + for (uint64_t file_num : proximal_file_numbers) { + FileMetaData file = CreateTestFile(file_num, file_prefix + "proximal_"); + progress.proximal_output_level_progress.AddToOutputFiles(file); + } + + return progress; + } + + std::pair + EncodeDecodeProgress(const SubcompactionProgress& progress) { + VersionEdit edit; + edit.SetSubcompactionProgress(progress); + + std::string encoded; + EXPECT_TRUE(edit.EncodeTo(&encoded, 0 /* ts_sz */)); + + VersionEdit decoded_edit; + EXPECT_OK(decoded_edit.DecodeFrom(encoded)); + EXPECT_TRUE(decoded_edit.HasSubcompactionProgress()); + + SubcompactionProgress decoded_progress = + decoded_edit.GetSubcompactionProgress(); + + return {std::move(decoded_edit), std::move(decoded_progress)}; + } + + void VerifyFileMetaDataEquality(const FileMetaData& expected, + const FileMetaData& actual) { + // Verify the major fields only + ASSERT_EQ(actual.fd.GetNumber(), expected.fd.GetNumber()); + ASSERT_EQ(actual.fd.GetFileSize(), expected.fd.GetFileSize()); + ASSERT_EQ(actual.smallest.Encode(), expected.smallest.Encode()); + ASSERT_EQ(actual.largest.Encode(), expected.largest.Encode()); + ASSERT_EQ(actual.oldest_ancester_time, expected.oldest_ancester_time); + ASSERT_EQ(actual.file_creation_time, expected.file_creation_time); + ASSERT_EQ(actual.epoch_number, expected.epoch_number); + ASSERT_EQ(actual.file_checksum, expected.file_checksum); + ASSERT_EQ(actual.file_checksum_func_name, expected.file_checksum_func_name); + ASSERT_EQ(actual.marked_for_compaction, expected.marked_for_compaction); + ASSERT_EQ(actual.temperature, expected.temperature); + } + + void VerifyProgressEquality(const SubcompactionProgress& expected, + const SubcompactionProgress& actual) { + ASSERT_EQ(actual.next_internal_key_to_compact, + expected.next_internal_key_to_compact); + + ASSERT_EQ(actual.num_processed_input_records, + expected.num_processed_input_records); + + for (const bool is_proximal_level : {false, true}) { + const SubcompactionProgressPerLevel& + actual_subcompaction_progress_by_level = + is_proximal_level ? actual.proximal_output_level_progress + : actual.output_level_progress; + + const SubcompactionProgressPerLevel& + expected_subcompaction_progress_by_level = + is_proximal_level ? expected.proximal_output_level_progress + : expected.output_level_progress; + + ASSERT_EQ( + actual_subcompaction_progress_by_level.GetNumProcessedOutputRecords(), + expected_subcompaction_progress_by_level + .GetNumProcessedOutputRecords()); + + ASSERT_EQ( + actual_subcompaction_progress_by_level.GetOutputFiles().size(), + expected_subcompaction_progress_by_level.GetOutputFiles().size()); + + for (size_t i = 0; + i < expected_subcompaction_progress_by_level.GetOutputFiles().size(); + ++i) { + VerifyFileMetaDataEquality( + expected_subcompaction_progress_by_level.GetOutputFiles()[i], + actual_subcompaction_progress_by_level.GetOutputFiles()[i]); + } + } + } +}; + +const std::string SubcompactionProgressTest::kTestChecksumFuncName = "crc32c"; + +TEST_F(SubcompactionProgressTest, BasicEncodeDecode) { + // Create progress with files for both levels + SubcompactionProgress progress = CreateSubcompactionProgress( + "key_100", // next_internal_key_to_compact + 500, // num_processed_input_records + 400, // num_processed_output_records + 100, // num_processed_proximal_level_output_records + {1}, // output_file_numbers + {2}, // proximal_file_numbers + "test_" // file_prefix + ); + + auto [ignored, decoded_progress] = EncodeDecodeProgress(progress); + + VerifyProgressEquality(progress, decoded_progress); +} + +TEST_F(SubcompactionProgressTest, OutputFilesDeltaEncodeDecode) { + // Test Delta Encoding/Decoding + SubcompactionProgress initial_progress = CreateSubcompactionProgress( + "key_100", // next_internal_key_to_compact + 100, // num_processed_input_records + 40, // num_processed_output_records + 60, // num_processed_proximal_level_output_records + {1}, // output_file_numbers + {2}, // proximal_file_numbers + "initial_" // file_prefix + ); + + auto [initial_decoded_edit, ignored_1] = + EncodeDecodeProgress(initial_progress); + initial_progress.output_level_progress.UpdateLastPersistedOutputFilesCount(); + initial_progress.proximal_output_level_progress + .UpdateLastPersistedOutputFilesCount(); + + // Add one new output file to output and proximal level + SubcompactionProgress updated_progress = initial_progress; + updated_progress.next_internal_key_to_compact = "key_300"; + updated_progress.num_processed_input_records = 1000; + updated_progress.output_level_progress.SetNumProcessedOutputRecords(400); + updated_progress.proximal_output_level_progress.SetNumProcessedOutputRecords( + 600); + + FileMetaData new_file = CreateTestFile(3, "new_"); + updated_progress.output_level_progress.AddToOutputFiles(new_file); + + FileMetaData new_file_proximal = CreateTestFile(4, "new_"); + updated_progress.proximal_output_level_progress.AddToOutputFiles( + new_file_proximal); + + auto [delta_decoded_edit, delta_decoded_progress] = + EncodeDecodeProgress(updated_progress); + + ASSERT_EQ(delta_decoded_progress.next_internal_key_to_compact, + updated_progress.next_internal_key_to_compact); + + ASSERT_EQ(delta_decoded_progress.num_processed_input_records, + updated_progress.num_processed_input_records); + + for (const bool& is_proximal_level : {false, true}) { + const SubcompactionProgressPerLevel& delta_progress_per_level = + is_proximal_level + ? delta_decoded_progress.proximal_output_level_progress + : delta_decoded_progress.output_level_progress; + + const SubcompactionProgressPerLevel& updated_progress_per_level = + is_proximal_level ? updated_progress.proximal_output_level_progress + : updated_progress.output_level_progress; + + ASSERT_EQ(delta_progress_per_level.GetNumProcessedOutputRecords(), + updated_progress_per_level.GetNumProcessedOutputRecords()); + + // Only the newly added file since last persistence should be present + ASSERT_EQ(delta_progress_per_level.GetOutputFiles().size(), 1); + + ASSERT_EQ(delta_progress_per_level.GetOutputFiles()[0].fd.GetNumber(), + is_proximal_level ? new_file_proximal.fd.GetNumber() + : new_file.fd.GetNumber()); + } + + // Test SubcompactionProgressBuilder + SubcompactionProgressBuilder builder; + ASSERT_FALSE(builder.HasAccumulatedSubcompactionProgress()); + + ASSERT_TRUE(builder.ProcessVersionEdit(initial_decoded_edit)); + ASSERT_TRUE(builder.HasAccumulatedSubcompactionProgress()); + ASSERT_TRUE(builder.ProcessVersionEdit(delta_decoded_edit)); + + const auto& accumulated_progress = + builder.GetAccumulatedSubcompactionProgress(); + + ASSERT_EQ(accumulated_progress.next_internal_key_to_compact, + updated_progress.next_internal_key_to_compact); + + ASSERT_EQ(accumulated_progress.num_processed_input_records, + updated_progress.num_processed_input_records); + + for (const bool& is_proximal_level : {false, true}) { + const SubcompactionProgressPerLevel& accumulated_progress_per_level = + is_proximal_level ? accumulated_progress.proximal_output_level_progress + : accumulated_progress.output_level_progress; + + const SubcompactionProgressPerLevel& updated_progress_per_level = + is_proximal_level ? updated_progress.proximal_output_level_progress + : updated_progress.output_level_progress; + + ASSERT_EQ(accumulated_progress_per_level.GetNumProcessedOutputRecords(), + updated_progress_per_level.GetNumProcessedOutputRecords()); + + ASSERT_EQ(accumulated_progress_per_level.GetOutputFiles().size(), + updated_progress_per_level.GetOutputFiles().size()); + + std::set accumulated_file_numbers; + for (const auto& file : accumulated_progress_per_level.GetOutputFiles()) { + accumulated_file_numbers.insert(file.fd.GetNumber()); + } + + std::set expected_file_numbers; + for (const auto& file : updated_progress_per_level.GetOutputFiles()) { + expected_file_numbers.insert(file.fd.GetNumber()); + } + + ASSERT_EQ(accumulated_file_numbers, expected_file_numbers); + } + + // ===== PART 3: Test Builder Reset ===== + builder.Clear(); + ASSERT_FALSE(builder.HasAccumulatedSubcompactionProgress()); +} + +TEST_F(SubcompactionProgressTest, UnknownTags) { + SubcompactionProgress progress; + std::string encoded; + + // 1. Test unknown ignorable tag + progress.next_internal_key_to_compact = "test_key"; + progress.num_processed_input_records = 100; + + PutVarint32(&encoded, + SubcompactionProgressCustomTag::kNextInternalKeyToCompact); + PutLengthPrefixedSlice(&encoded, progress.next_internal_key_to_compact); + + PutVarint32(&encoded, + SubcompactionProgressCustomTag::kNumProcessedInputRecords); + std::string varint_records; + PutVarint64(&varint_records, progress.num_processed_input_records); + PutLengthPrefixedSlice(&encoded, varint_records); + + // Manually encode with unknown ignorable tag (has + // SubcompactionProgressCustomTag::kSubcompactionProgressCustomTagSafeIgnoreMask + // bit set) + uint32_t unknown_ignorable_tag = + SubcompactionProgressCustomTag:: + kSubcompactionProgressCustomTagSafeIgnoreMask + + 1; + PutVarint32(&encoded, unknown_ignorable_tag); + PutLengthPrefixedSlice(&encoded, "future_data"); + + PutVarint32(&encoded, + SubcompactionProgressCustomTag::kSubcompactionProgressTerminate); + + // Test decoding - should succeed and ignore unknown tag + Slice input(encoded); + SubcompactionProgress decoded_progress; + Status s = decoded_progress.DecodeFrom(&input); + ASSERT_OK(s); + + // Verify known fields are preserved + ASSERT_EQ(decoded_progress.next_internal_key_to_compact, + progress.next_internal_key_to_compact); + ASSERT_EQ(decoded_progress.num_processed_input_records, + progress.num_processed_input_records); + + // 2. Test unknown non-ignorable tag + encoded.clear(); + PutVarint32(&encoded, + SubcompactionProgressCustomTag::kNextInternalKeyToCompact); + PutLengthPrefixedSlice(&encoded, "test_key"); + + // Manually encode with unknown non-ignorable tag (do not have + // SubcompactionProgressCustomTag::kSubcompactionProgressCustomTagSafeIgnoreMask + // bit set) + uint32_t unknown_critical_tag = + SubcompactionProgressCustomTag:: + kSubcompactionProgressCustomTagSafeIgnoreMask - + 1; + PutVarint32(&encoded, unknown_critical_tag); + PutLengthPrefixedSlice(&encoded, "critical_future_data"); + PutVarint32(&encoded, + SubcompactionProgressCustomTag::kSubcompactionProgressTerminate); + + // Test decoding - should fail on critical unknown tag + Slice critical_input(encoded); + SubcompactionProgress critical_progress; + Status critical_status = critical_progress.DecodeFrom(&critical_input); + ASSERT_NOK(critical_status); + ASSERT_TRUE(critical_status.IsNotSupported()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/version_set.cc b/db/version_set.cc index b560713cbbab..6c9cbc82a17c 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -95,6 +96,8 @@ namespace ROCKSDB_NAMESPACE { namespace { +using ScanOptionsMap = std::unordered_map; + // Find File in LevelFilesBrief data structure // Within an index range defined by left and right int FindFileInRange(const InternalKeyComparator& icmp, @@ -978,7 +981,8 @@ class LevelIterator final : public InternalIterator { nullptr, bool allow_unprepared_value = false, std::unique_ptr*** range_tombstone_iter_ptr_ = - nullptr) + nullptr, + Statistics* db_statistics = nullptr, SystemClock* clock = nullptr) : table_cache_(table_cache), read_options_(read_options), file_options_(file_options), @@ -1002,7 +1006,10 @@ class LevelIterator final : public InternalIterator { skip_filters_(skip_filters), allow_unprepared_value_(allow_unprepared_value), is_next_read_sequential_(false), - to_return_sentinel_(false) { + to_return_sentinel_(false), + scan_opts_(nullptr), + db_statistics_(db_statistics), + clock_(clock) { // Empty level is not supported. assert(flevel_ != nullptr && flevel_->num_files > 0); if (range_tombstone_iter_ptr_) { @@ -1010,7 +1017,15 @@ class LevelIterator final : public InternalIterator { } } - ~LevelIterator() override { delete file_iter_.Set(nullptr); } + ~LevelIterator() override { + delete file_iter_.Set(nullptr); + // Clean up any prepared iterators that weren't used + for (auto& entry : prepared_iters_) { + delete entry.second; + } + prepared_iters_.clear(); + assert(prepared_iters_.size() == 0); + } // Seek to the first file with a key >= target. // If range_tombstone_iter_ is not nullptr, then we pretend that file @@ -1098,6 +1113,136 @@ class LevelIterator final : public InternalIterator { read_seq_ = read_seq; } + inline bool FileHasMultiScanArg(size_t file_index) { + if (file_to_scan_opts_.get()) { + auto it = file_to_scan_opts_->find(file_index); + if (it != file_to_scan_opts_->end()) { + return !it->second.empty(); + } + } + return false; + } + + MultiScanArgs& GetMultiScanArgForFile(size_t file_index) { + auto multi_scan_args_it = file_to_scan_opts_->find(file_index); + if (multi_scan_args_it == file_to_scan_opts_->end()) { + auto ret = file_to_scan_opts_->emplace( + file_index, MultiScanArgs(user_comparator_.user_comparator())); + multi_scan_args_it = ret.first; + assert(ret.second); + } + return multi_scan_args_it->second; + } + + void Prepare(const MultiScanArgs* so) override { + // We assume here that scan_opts is sorted such that + // scan_opts[0].range.start < scan_opts[1].range.start, and non + // overlapping + if (so == nullptr) { + return; + } + + scan_opts_ = so; + + // Verify comparator is consistent + assert(so->GetComparator() == user_comparator_.user_comparator()); + + file_to_scan_opts_ = std::make_unique(); + for (size_t k = 0; k < scan_opts_->size(); k++) { + const ScanOptions& opt = scan_opts_->GetScanRanges().at(k); + auto start = opt.range.start; + auto end = opt.range.limit; + + if (!start.has_value()) { + continue; + } + + // We can capture this case in the future, but for now lets skip this. + if (!end.has_value()) { + continue; + } + + const size_t timestamp_size = + user_comparator_.user_comparator()->timestamp_size(); + InternalKey istart, iend; + if (timestamp_size == 0) { + istart = + InternalKey(start.value(), kMaxSequenceNumber, kValueTypeForSeek); + // end key is exclusive for multiscan + iend = InternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek); + } else { + std::string start_key_with_ts, end_key_with_ts; + AppendKeyWithMaxTimestamp(&start_key_with_ts, start.value(), + timestamp_size); + AppendKeyWithMaxTimestamp(&end_key_with_ts, end.value(), + timestamp_size); + istart = InternalKey(start_key_with_ts, kMaxSequenceNumber, + kValueTypeForSeek); + // end key is exclusive for multiscan + iend = + InternalKey(end_key_with_ts, kMaxSequenceNumber, kValueTypeForSeek); + } + + // TODO: This needs to be optimized, right now we iterate twice, which + // we dont need to. We can do this in N rather than 2N. + size_t fstart = FindFile(icomparator_, *flevel_, istart.Encode()); + size_t fend = FindFile(icomparator_, *flevel_, iend.Encode()); + + // We need to check the relevant cases + // Cases: + // 1. [ S E ] + // 2. [ S ] [ E ] + // 3. [ S ] ...... [ E ] + for (auto i = fstart; i <= fend; i++) { + if (i < flevel_->num_files) { + // FindFile only compares against the largest_key, so we need this + // additional check to ensure the scan range overlaps the file + if (icomparator_.InternalKeyComparator::Compare( + iend.Encode(), flevel_->files[i].smallest_key) < 0) { + continue; + } + auto const metadata = flevel_->files[i].file_metadata; + if (metadata->FileIsStandAloneRangeTombstone()) { + // Skip stand alone range deletion files. + continue; + } + auto& args = GetMultiScanArgForFile(i); + args.insert(start.value(), end.value(), opt.property_bag); + } + } + } + + StopWatch timer(clock_, db_statistics_, MULTISCAN_PREPARE_ITERATORS); + + // Propagate multiscan configs + for (auto& file_to_arg : *file_to_scan_opts_) { + file_to_arg.second.CopyConfigFrom(*so); + assert(OverlapRange(*file_to_arg.second.GetScanRanges().begin(), + file_to_arg.first) && + OverlapRange(*file_to_arg.second.GetScanRanges().rbegin(), + file_to_arg.first)); + } + + if (so->use_async_io) { + auto before = file_index_; + // Pre-create and prepare only relevant file iterators + for (auto& file_to_arg : *file_to_scan_opts_) { + size_t file_index = file_to_arg.first; + + file_index_ = file_index; + // Create iterator for this file + auto iter = NewFileIterator(); + if (iter != nullptr) { + // If we have async enabled, lets prepare all our iterators. + iter->Prepare(&file_to_arg.second); + // Store the prepared iterator + prepared_iters_[file_index] = iter; + } + } + file_index_ = before; + } + } + private: // Return true if at least one invalid file is seen and skipped. bool SkipEmptyFileForward(); @@ -1170,6 +1315,10 @@ class LevelIterator final : public InternalIterator { } } +#ifndef NDEBUG + bool OverlapRange(const ScanOptions& opts, size_t file_index); +#endif + TableCache* table_cache_; const ReadOptions& read_options_; const FileOptions& file_options_; @@ -1223,6 +1372,16 @@ class LevelIterator final : public InternalIterator { bool prefix_exhausted_ = false; // Whether next/prev key is a sentinel key. bool to_return_sentinel_ = false; + const MultiScanArgs* scan_opts_ = nullptr; + + Statistics* db_statistics_ = nullptr; + SystemClock* clock_ = nullptr; + + // Our stored scan_opts for each prefix + std::unique_ptr file_to_scan_opts_ = nullptr; + + // Map to store pre-created iterators by file index + std::unordered_map prepared_iters_; // Sets flags for if we should return the sentinel key next. // The condition for returning sentinel is reaching the end of current @@ -1263,6 +1422,14 @@ void LevelIterator::Seek(const Slice& target) { } if (file_iter_.iter() != nullptr) { + if (scan_opts_) { + // At this point, we only know that the seek target is < largest_key + // in the file. We need to check whether there is actual overlap. + const FdWithKeyRange& cur_file = flevel_->files[file_index_]; + if (KeyReachedUpperBound(cur_file.smallest_key)) { + return; + } + } file_iter_.Seek(target); // Status::TryAgain indicates asynchronous request for retrieval of data // blocks has been submitted. So it should return at this point and Seek @@ -1485,7 +1652,31 @@ bool LevelIterator::SkipEmptyFileForward() { // LevelIterator::Seek*, it should also call Seek* into the corresponding // range tombstone iterator. if (file_iter_.iter() != nullptr) { - file_iter_.SeekToFirst(); + // If we are doing prepared scan opts then we should seek to the values + // specified by the scan opts + + if (scan_opts_ && FileHasMultiScanArg(file_index_)) { + const ScanOptions& opts = + GetMultiScanArgForFile(file_index_).GetScanRanges().front(); + if (opts.range.start.has_value()) { + InternalKey target; + const size_t ts_size = + user_comparator_.user_comparator()->timestamp_size(); + if (ts_size == 0) { + target = InternalKey(opts.range.start.value(), kMaxSequenceNumber, + kValueTypeForSeek); + } else { + std::string seek_key; + AppendKeyWithMaxTimestamp(&seek_key, opts.range.start.value(), + ts_size); + target = + InternalKey(seek_key, kMaxSequenceNumber, kValueTypeForSeek); + } + file_iter_.Seek(target.Encode()); + } + } else { + file_iter_.SeekToFirst(); + } if (range_tombstone_iter_) { if (*range_tombstone_iter_) { (*range_tombstone_iter_)->SeekToFirst(); @@ -1527,13 +1718,25 @@ void LevelIterator::SkipEmptyFileBackward() { } } +#ifndef NDEBUG +bool LevelIterator::OverlapRange(const ScanOptions& opts, size_t file_index) { + return (user_comparator_.CompareWithoutTimestamp( + opts.range.start.value(), /*a_has_ts=*/false, + ExtractUserKey(flevel_->files[file_index].largest_key), + /*b_has_ts=*/true) <= 0 && + user_comparator_.CompareWithoutTimestamp( + opts.range.limit.value(), /*a_has_ts=*/false, + ExtractUserKey(flevel_->files[file_index].smallest_key), + /*b_has_ts=*/true) > 0); +} +#endif + void LevelIterator::SetFileIterator(InternalIterator* iter) { if (pinned_iters_mgr_ && iter) { iter->SetPinnedItersMgr(pinned_iters_mgr_); } InternalIterator* old_iter = file_iter_.Set(iter); - // Update the read pattern for PrefetchBuffer. if (is_next_read_sequential_) { file_iter_.UpdateReadaheadState(old_iter); @@ -1563,11 +1766,29 @@ void LevelIterator::InitFileIterator(size_t new_file_index) { // no need to change anything } else { file_index_ = new_file_index; + if (!prepared_iters_.empty()) { + auto prepared_it = prepared_iters_.find(file_index_); + if (prepared_it != prepared_iters_.end()) { + InternalIterator* iter = prepared_it->second; + prepared_iters_.erase(prepared_it); + SetFileIterator(iter); + return; + } + } + InternalIterator* iter = NewFileIterator(); + if (FileHasMultiScanArg(file_index_)) { + auto& args = GetMultiScanArgForFile(file_index_); + assert(OverlapRange(*args.GetScanRanges().begin(), file_index_) && + OverlapRange(*args.GetScanRanges().rbegin(), file_index_)); + iter->Prepare(&args); + } + SetFileIterator(iter); } } } + } // anonymous namespace Status Version::GetTableProperties(const ReadOptions& read_options, @@ -1599,8 +1820,10 @@ Status Version::GetTableProperties(const ReadOptions& read_options, file_name = TableFileName(ioptions.cf_paths, file_meta->fd.GetNumber(), file_meta->fd.GetPathId()); } - s = ioptions.fs->NewRandomAccessFile(file_name, file_options_, &file, - nullptr); + FileOptions fopts = file_options_; + fopts.file_checksum = file_meta->file_checksum; + fopts.file_checksum_func_name = file_meta->file_checksum_func_name; + s = ioptions.fs->NewRandomAccessFile(file_name, fopts, &file, nullptr); if (!s.ok()) { return s; } @@ -1627,8 +1850,8 @@ Status Version::GetTableProperties(const ReadOptions& read_options, return s; } -Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options, - TablePropertiesCollection* props) { +Status Version::GetPropertiesOfAllTables( + const ReadOptions& read_options, TablePropertiesCollection* props) const { Status s; for (int level = 0; level < storage_info_.num_levels_; level++) { s = GetPropertiesOfAllTables(read_options, props, level); @@ -1699,7 +1922,7 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options, TablePropertiesCollection* props, - int level) { + int level) const { for (const auto& file_meta : storage_info_.files_[level]) { auto fname = TableFileName(cfd_->ioptions().cf_paths, file_meta->fd.GetNumber(), @@ -1753,6 +1976,24 @@ Status Version::GetPropertiesOfTablesInRange( return Status::OK(); } +Status Version::GetPropertiesOfTablesByLevel( + const ReadOptions& read_options, + std::vector>* props_by_level) + const { + Status s; + + props_by_level->reserve(storage_info_.num_levels_); + for (int level = 0; level < storage_info_.num_levels_; level++) { + props_by_level->push_back(std::make_unique()); + s = GetPropertiesOfAllTables(read_options, props_by_level->back().get(), + level); + if (!s.ok()) { + return s; + } + } + return Status::OK(); +} + Status Version::GetAggregatedTableProperties( const ReadOptions& read_options, std::shared_ptr* tp, int level) { @@ -1850,6 +2091,79 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { } } +void Version::GetColumnFamilyMetaData( + const GetColumnFamilyMetaDataOptions& options, + ColumnFamilyMetaData* cf_meta) { + assert(cf_meta); + assert(cfd_); + + cf_meta->name = cfd_->GetName(); + cf_meta->size = 0; + cf_meta->file_count = 0; + cf_meta->levels.clear(); + cf_meta->blob_file_size = 0; + cf_meta->blob_file_count = 0; + cf_meta->blob_files.clear(); + + const auto& ioptions = cfd_->ioptions(); + auto* vstorage = storage_info(); + + int first_level = (options.level >= 0) ? options.level : 0; + int last_level = + (options.level >= 0) ? options.level + 1 : cfd_->NumberLevels(); + + InternalKey ikey_start, ikey_end; + const InternalKey* begin = nullptr; + const InternalKey* end = nullptr; + + if (options.range.start.has_value()) { + ikey_start = InternalKey(options.range.start.value(), kMaxSequenceNumber, + kValueTypeForSeek); + begin = &ikey_start; + } + + if (options.range.limit.has_value()) { + ikey_end = InternalKey(options.range.limit.value(), kMaxSequenceNumber, + kValueTypeForSeek); + end = &ikey_end; + } + + for (int l = first_level; l < last_level; ++l) { + uint64_t level_size = 0; + std::vector files; + std::vector overlapping_files; + vstorage->GetOverlappingInputs(l, begin, end, &overlapping_files); + + for (const auto& file : overlapping_files) { + uint32_t path_id = file->fd.GetPathId(); + const auto& file_path = (path_id < ioptions.cf_paths.size()) + ? ioptions.cf_paths[path_id].path + : ioptions.cf_paths.back().path; + const uint64_t file_number = file->fd.GetNumber(); + files.emplace_back( + MakeTableFileName("", file_number), file_number, file_path, + file->fd.GetFileSize(), file->fd.smallest_seqno, + file->fd.largest_seqno, file->smallest.user_key().ToString(), + file->largest.user_key().ToString(), + file->stats.num_reads_sampled.load(std::memory_order_relaxed), + file->being_compacted, file->temperature, + file->oldest_blob_file_number, file->TryGetOldestAncesterTime(), + file->TryGetFileCreationTime(), file->epoch_number, + file->file_checksum, file->file_checksum_func_name); + files.back().num_entries = file->num_entries; + files.back().num_deletions = file->num_deletions; + files.back().smallest = file->smallest.Encode().ToString(); + files.back().largest = file->largest.Encode().ToString(); + level_size += file->fd.GetFileSize(); + cf_meta->file_count++; + } + if (!files.empty()) { + cf_meta->levels.emplace_back(l, level_size, std::move(files)); + cf_meta->size += level_size; + } + } +} + uint64_t Version::GetSstFilesSize() { uint64_t sst_files_size = 0; for (int level = 0; level < storage_info_.num_levels_; level++) { @@ -1934,7 +2248,7 @@ InternalIterator* Version::TEST_GetLevelIterator( cfd_->internal_stats()->GetFileReadHist(level), TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, nullptr /* range_del_agg */, nullptr /* compaction_boundaries */, - allow_unprepared_value, &tombstone_iter_ptr); + allow_unprepared_value, &tombstone_iter_ptr, db_statistics_, clock_); if (read_options.ignore_range_deletions) { merge_iter_builder->AddIterator(level_iter); } else { @@ -2074,7 +2388,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, /*range_del_agg=*/nullptr, /*compaction_boundaries=*/nullptr, allow_unprepared_value, - &tombstone_iter_ptr); + &tombstone_iter_ptr, db_statistics_, clock_); if (read_options.ignore_range_deletions) { merge_iter_builder->AddIterator(level_iter); } else { @@ -2131,7 +2445,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, mutable_cf_options_, should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, - &range_del_agg, nullptr, false)); + &range_del_agg, nullptr, false, nullptr, db_statistics_, clock_)); status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); } @@ -2709,9 +3023,10 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size()); // Collect all results so far - std::vector statuses = folly::coro::blockingWait( - folly::coro::collectAllRange(std::move(mget_tasks)) - .scheduleOn(&range->context()->executor())); + std::vector statuses = + folly::coro::blockingWait(co_withExecutor( + &range->context()->executor(), + folly::coro::collectAllRange(std::move(mget_tasks)))); if (s.ok()) { for (Status stat : statuses) { if (!stat.ok()) { @@ -2996,9 +3311,10 @@ Status Version::MultiGetAsync( assert(waiting.size()); RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size()); // Collect all results so far - std::vector statuses = folly::coro::blockingWait( - folly::coro::collectAllRange(std::move(mget_tasks)) - .scheduleOn(&range->context()->executor())); + std::vector statuses = + folly::coro::blockingWait(co_withExecutor( + &range->context()->executor(), + folly::coro::collectAllRange(std::move(mget_tasks)))); mget_tasks.clear(); if (s.ok()) { for (Status stat : statuses) { @@ -3130,7 +3446,7 @@ bool Version::MaybeInitializeFileMetaData(const ReadOptions& read_options, // Ensure new invariants on old files file_meta->num_deletions = std::max(tp->num_deletions, tp->num_range_deletions); - file_meta->num_entries = std::max(tp->num_entries, tp->num_deletions); + file_meta->num_entries = std::max(tp->num_entries, file_meta->num_deletions); return true; } @@ -3423,7 +3739,8 @@ bool ShouldChangeFileTemperature(const ImmutableOptions& ioptions, void VersionStorageInfo::ComputeCompactionScore( const ImmutableOptions& immutable_options, - const MutableCFOptions& mutable_cf_options) { + const MutableCFOptions& mutable_cf_options, + const std::string& full_history_ts_low) { double total_downcompact_bytes = 0.0; // Historically, score is defined as actual bytes in a level divided by // the level's target size, and 1.0 is the threshold for triggering @@ -3434,7 +3751,9 @@ void VersionStorageInfo::ComputeCompactionScore( // maintaining it to be over 1.0, we scale the original score by 10x // if it is larger than 1.0. const double kScoreScale = 10.0; - int max_output_level = MaxOutputLevel(immutable_options.allow_ingest_behind); + int max_output_level = + MaxOutputLevel(immutable_options.cf_allow_ingest_behind || + immutable_options.allow_ingest_behind); for (int level = 0; level <= MaxInputLevel(); level++) { double score; if (level == 0) { @@ -3475,10 +3794,20 @@ void VersionStorageInfo::ComputeCompactionScore( } if (compaction_style_ == kCompactionStyleFIFO) { - score = static_cast(total_size) / - mutable_cf_options.compaction_options_fifo.max_table_files_size; - if (score < 1 && - mutable_cf_options.compaction_options_fifo.allow_compaction) { + const auto& fifo_opts = mutable_cf_options.compaction_options_fifo; + uint64_t effective_size = total_size; + uint64_t effective_max = fifo_opts.max_table_files_size; + if (fifo_opts.max_data_files_size > 0) { + // Blob-aware: include blob file sizes in the total + effective_size += GetBlobStats().total_file_size; + effective_max = fifo_opts.max_data_files_size; + } + if (effective_max == 0) { + // avoid divide 0 + effective_max = 1; + } + score = static_cast(effective_size) / effective_max; + if (score < 1 && fifo_opts.allow_compaction) { score = std::max( static_cast(num_sorted_runs) / mutable_cf_options.level0_file_num_compaction_trigger, @@ -3614,7 +3943,9 @@ void VersionStorageInfo::ComputeCompactionScore( } ComputeFilesMarkedForCompaction(max_output_level); ComputeBottommostFilesMarkedForCompaction( - immutable_options.allow_ingest_behind); + immutable_options.cf_allow_ingest_behind || + immutable_options.allow_ingest_behind, + immutable_options.user_comparator, full_history_ts_low); ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl); ComputeFilesMarkedForPeriodicCompaction( immutable_options, mutable_cf_options.periodic_compaction_seconds, @@ -4205,17 +4536,20 @@ void VersionStorageInfo::GenerateFileLocationIndex() { } } -void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum, - bool allow_ingest_behind) { +void VersionStorageInfo::UpdateOldestSnapshot( + SequenceNumber seqnum, bool allow_ingest_behind, const Comparator* ucmp, + const std::string& full_history_ts_low) { assert(seqnum >= oldest_snapshot_seqnum_); oldest_snapshot_seqnum_ = seqnum; if (oldest_snapshot_seqnum_ > bottommost_files_mark_threshold_) { - ComputeBottommostFilesMarkedForCompaction(allow_ingest_behind); + ComputeBottommostFilesMarkedForCompaction(allow_ingest_behind, ucmp, + full_history_ts_low); } } void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction( - bool allow_ingest_behind) { + bool allow_ingest_behind, const Comparator* ucmp, + const std::string& full_history_ts_low) { bottommost_files_marked_for_compaction_.clear(); bottommost_files_mark_threshold_ = kMaxSequenceNumber; if (allow_ingest_behind) { @@ -4236,12 +4570,39 @@ void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction( current_time - static_cast(bottommost_file_compaction_delay_); } + // For UDT, we need to check if the file's max timestamp is below + // full_history_ts_low. If not, the compaction won't be able to collapse the + // timestamp to clean up the tombstone , so marking the file would be futile + // and could cause an infinite compaction loop. + const bool has_udt = ucmp && ucmp->timestamp_size() > 0; + for (auto& level_and_file : bottommost_files_) { if (!level_and_file.second->being_compacted && level_and_file.second->fd.largest_seqno != 0) { // largest_seqno might be nonzero due to containing the final key in an // earlier compaction, whose seqnum we didn't zero out. if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) { + if (has_udt) { + const std::string& max_ts = level_and_file.second->max_timestamp; + // If max_timestamp is empty, the file could come from very old + // version which does not have timestamp. In that case, we should pick + // the file for compaction. After compaction, the file will have + // max_timestamp set propertly. + if (!max_ts.empty()) { + // If full_history_ts_low is empty, it means it was never set, which + // means its value is 0. Therefore, it would be always smaller than + // max_timestamp + if (full_history_ts_low.empty()) { + continue; + } + // If max timestamp >= full_history_ts_low, skip this file + if (ucmp->CompareTimestamp(Slice(max_ts), full_history_ts_low) >= + 0) { + continue; + } + } + } + if (!needs_delay) { bottommost_files_marked_for_compaction_.push_back(level_and_file); } else if (creation_time_ub > 0) { @@ -4303,7 +4664,8 @@ bool VersionStorageInfo::OverlapInLevel(int level, void VersionStorageInfo::GetOverlappingInputs( int level, const InternalKey* begin, const InternalKey* end, std::vector* inputs, int hint_index, int* file_index, - bool expand_range, InternalKey** next_smallest) const { + bool expand_range, const FileMetaData* starting_l0_file, + InternalKey** next_smallest) const { if (level >= num_non_empty_levels_) { // this level is empty, no overlapping inputs return; @@ -4336,7 +4698,19 @@ void VersionStorageInfo::GetOverlappingInputs( // index stores the file index need to check. std::list index; - for (size_t i = 0; i < level_files_brief_[level].num_files; i++) { + size_t start_index = 0; + if (starting_l0_file != nullptr) { + uint64_t starting_file_number = starting_l0_file->fd.GetNumber(); + for (size_t i = 0; i < level_files_brief_[level].num_files; i++) { + if (level_files_brief_[level].files[i].fd.GetNumber() == + starting_file_number) { + start_index = i; + break; + } + } + assert(start_index < level_files_brief_[level].num_files); + } + for (size_t i = start_index; i < level_files_brief_[level].num_files; i++) { index.emplace_back(i); } @@ -4611,8 +4985,7 @@ void VersionStorageInfo::RecoverEpochNumbers(ColumnFamilyData* cfd, if (restart_epoch) { cfd->ResetNextEpochNumber(); - bool reserve_epoch_num_for_file_ingested_behind = - cfd->ioptions().allow_ingest_behind; + bool reserve_epoch_num_for_file_ingested_behind = cfd->AllowIngestBehind(); if (reserve_epoch_num_for_file_ingested_behind) { uint64_t reserved_epoch_number = cfd->NewEpochNumber(); assert(reserved_epoch_number == @@ -4620,7 +4993,8 @@ void VersionStorageInfo::RecoverEpochNumbers(ColumnFamilyData* cfd, ROCKS_LOG_INFO(cfd->ioptions().info_log.get(), "[%s]CF has reserved epoch number %" PRIu64 " for files ingested " - "behind since `Options::allow_ingest_behind` is true", + "behind since `Options::allow_ingest_behind` or " + "`Options::cf_allow_ingest_behind` is true", cfd->GetName().c_str(), reserved_epoch_number); } } @@ -4761,7 +5135,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions, cur_level_size <= base_bytes_min && (options.preclude_last_level_data_seconds == 0 || i < num_levels_ - 2)) { - // When per_key_placement is enabled, the penultimate level is + // When per_key_placement is enabled, the proximal level is // necessary. lowest_unnecessary_level_ = i; } @@ -4903,24 +5277,38 @@ bool VersionStorageInfo::RangeMightExistAfterSortedRun( } Env::WriteLifeTimeHint VersionStorageInfo::CalculateSSTWriteHint( - int level) const { - if (compaction_style_ != kCompactionStyleLevel) { + int level, CompactionStyleSet compaction_style_set) const { + if (!compaction_style_set.Contains(compaction_style_)) { return Env::WLTH_NOT_SET; } - if (level == 0) { - return Env::WLTH_MEDIUM; - } - // L1: medium, L2: long, ... - if (level - base_level_ >= 2) { - return Env::WLTH_EXTREME; - } else if (level < base_level_) { - // There is no restriction which prevents level passed in to be smaller - // than base_level. - return Env::WLTH_MEDIUM; + switch (compaction_style_) { + case kCompactionStyleLevel: + if (level == 0) { + return Env::WLTH_MEDIUM; + } + + // L1: medium, L2: long, ... + if (level - base_level_ >= 2) { + return Env::WLTH_EXTREME; + } else if (level < base_level_) { + // There is no restriction which prevents level passed in to be smaller + // than base_level. + return Env::WLTH_MEDIUM; + } + return static_cast( + level - base_level_ + static_cast(Env::WLTH_MEDIUM)); + case kCompactionStyleUniversal: + if (level == 0) { + return Env::WLTH_SHORT; + } + if (level == 1) { + return Env::WLTH_MEDIUM; + } + return Env::WLTH_LONG; + default: + return Env::WLTH_NOT_SET; } - return static_cast( - level - base_level_ + static_cast(Env::WLTH_MEDIUM)); } void Version::AddLiveFiles(std::vector* live_table_files, @@ -5109,12 +5497,13 @@ void AtomicGroupReadBuffer::Clear() { VersionSet::VersionSet( const std::string& dbname, const ImmutableDBOptions* _db_options, + const MutableDBOptions& mutable_db_options, const FileOptions& storage_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, WriteController* write_controller, BlockCacheTracer* const block_cache_tracer, const std::shared_ptr& io_tracer, const std::string& db_id, const std::string& db_session_id, const std::string& daily_offpeak_time_utc, - ErrorHandler* const error_handler, const bool read_only) + ErrorHandler* error_handler, bool unchanging) : column_family_set_(new ColumnFamilySet( dbname, _db_options, storage_options, table_cache, write_buffer_manager, write_controller, block_cache_tracer, io_tracer, @@ -5137,18 +5526,21 @@ VersionSet::VersionSet( prev_log_number_(0), current_version_number_(0), manifest_file_size_(0), + last_compacted_manifest_file_size_(0), file_options_(storage_options), block_cache_tracer_(block_cache_tracer), io_tracer_(io_tracer), db_session_id_(db_session_id), offpeak_time_option_(OffpeakTimeOption(daily_offpeak_time_utc)), error_handler_(error_handler), - read_only_(read_only), - closed_(false) {} + unchanging_(unchanging), + closed_(false) { + UpdatedMutableDbOptions(mutable_db_options, /*mu=*/nullptr); +} Status VersionSet::Close(FSDirectory* db_dir, InstrumentedMutex* mu) { Status s; - if (closed_ || read_only_ || !manifest_file_number_ || !descriptor_log_) { + if (closed_ || unchanging_ || !manifest_file_number_ || !descriptor_log_) { return s; } @@ -5218,6 +5610,15 @@ void VersionSet::Reset() { if (column_family_set_) { WriteBufferManager* wbm = column_family_set_->write_buffer_manager(); WriteController* wc = column_family_set_->write_controller(); + + // Clear TableCache to prevent use-after-free: Reset() deletes old + // ColumnFamilySet but reuses table_cache_, which may contain + // BlockBasedTable entries with dangling references to deleted CFD's + // ioptions. + if (table_cache_) { + table_cache_->EraseUnRefEntries(); + } + // db_id becomes the source of truth after DBImpl::Recover(): // https://github.com/facebook/rocksdb/blob/v7.3.1/db/db_impl/db_impl_open.cc#L527 // Note: we may not be able to recover db_id from MANIFEST if @@ -5240,17 +5641,45 @@ void VersionSet::Reset() { current_version_number_ = 0; manifest_writers_.clear(); manifest_file_size_ = 0; + last_compacted_manifest_file_size_ = 0; + TuneMaxManifestFileSize(); obsolete_files_.clear(); obsolete_manifests_.clear(); wals_.Reset(); } +void VersionSet::UpdatedMutableDbOptions( + const MutableDBOptions& updated_options, InstrumentedMutex* mu) { + // Must be holding mutex if not called during initialization + if (mu) { + mu->AssertHeld(); + } else { + // manifest_file_size_ must be 0 if called from the constructor + assert(manifest_file_size_ == 0); + } + file_options_.writable_file_max_buffer_size = + updated_options.writable_file_max_buffer_size; + min_max_manifest_file_size_ = updated_options.max_manifest_file_size; + max_manifest_space_amp_pct_ = static_cast( + std::max(updated_options.max_manifest_space_amp_pct, 0)); + manifest_preallocation_size_ = updated_options.manifest_preallocation_size; + TuneMaxManifestFileSize(); +} + +void VersionSet::TuneMaxManifestFileSize() { + tuned_max_manifest_file_size_ = + std::max(min_max_manifest_file_size_, + last_compacted_manifest_file_size_ * + (100U + max_manifest_space_amp_pct_) / 100U); +} + void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, Version* v) { // compute new compaction score v->storage_info()->ComputeCompactionScore( column_family_data->ioptions(), - column_family_data->GetLatestMutableCFOptions()); + column_family_data->GetLatestMutableCFOptions(), + column_family_data->GetFullHistoryTsLow()); // Mark v finalized v->storage_info_.SetFinalized(); @@ -5327,8 +5756,8 @@ Status VersionSet::ProcessManifestWrites( // the preceding version edits in the same atomic group, and update // their `remaining_entries_` member variable because we are NOT going // to write the version edits' of dropped CF to the MANIFEST. If we - // don't update, then Recover can report corrupted atomic group because - // the `remaining_entries_` do not match. + // don't update, then Recover can report corrupted atomic group + // because the `remaining_entries_` do not match. if (!batch_edits.empty()) { if (batch_edits.back()->IsInAtomicGroup() && batch_edits.back()->GetRemainingEntries() > 0) { @@ -5488,10 +5917,11 @@ Status VersionSet::ProcessManifestWrites( } #endif // NDEBUG + uint64_t prev_manifest_file_size = manifest_file_size_; assert(pending_manifest_file_number_ == 0); if (!skip_manifest_write && (!descriptor_log_ || - manifest_file_size_ > db_options_->max_manifest_file_size)) { + prev_manifest_file_size >= tuned_max_manifest_file_size_)) { TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest"); new_descriptor_log = true; } else { @@ -5531,6 +5961,8 @@ Status VersionSet::ProcessManifestWrites( IOStatus manifest_io_status; manifest_io_status.PermitUncheckedError(); std::unique_ptr new_desc_log_ptr; + // Save before releasing mu + uint64_t manifest_preallocation_size = manifest_preallocation_size_; if (skip_manifest_write) { if (s.ok()) { constexpr bool update_stats = true; @@ -5574,16 +6006,13 @@ Status VersionSet::ProcessManifestWrites( // This is fine because everything inside of this block is serialized -- // only one thread can be here at the same time // create new manifest file - ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n", - pending_manifest_file_number_); std::string descriptor_fname = DescriptorFileName(dbname_, pending_manifest_file_number_); std::unique_ptr descriptor_file; io_s = NewWritableFile(fs_.get(), descriptor_fname, &descriptor_file, opt_file_opts); if (io_s.ok()) { - descriptor_file->SetPreallocationBlockSize( - db_options_->manifest_preallocation_size); + descriptor_file->SetPreallocationBlockSize(manifest_preallocation_size); FileTypeSet tmp_set = db_options_->checksum_handoff_file_types; std::unique_ptr file_writer(new WritableFileWriter( std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_, @@ -5633,10 +6062,12 @@ Status VersionSet::ProcessManifestWrites( #ifndef NDEBUG if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) { TEST_SYNC_POINT_CALLBACK( - "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0", + "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:" + "0", nullptr); TEST_SYNC_POINT( - "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1"); + "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:" + "1"); } ++idx; #endif /* !NDEBUG */ @@ -5673,8 +6104,8 @@ Status VersionSet::ProcessManifestWrites( file_options_.temperature, dir_contains_current_file); if (!io_s.ok()) { s = io_s; - // Quarantine old manifest file in case new manifest file's CURRENT file - // wasn't created successfully and the old manifest is needed. + // Quarantine old manifest file in case new manifest file's CURRENT + // file wasn't created successfully and the old manifest is needed. limbo_descriptor_log_file_number.push_back(manifest_file_number_); files_to_quarantine_if_commit_fail.push_back( &limbo_descriptor_log_file_number); @@ -5684,6 +6115,13 @@ Status VersionSet::ProcessManifestWrites( if (s.ok()) { // find offset in manifest file where this version is stored. new_manifest_file_size = raw_desc_log_ptr->file()->GetFileSize(); + if (new_descriptor_log) { + ROCKS_LOG_INFO(db_options_->info_log, + "Created manifest %" PRIu64 + ", compacted+appended from %" PRIu64 " to %" PRIu64 "\n", + pending_manifest_file_number_, prev_manifest_file_size, + new_manifest_file_size); + } } if (first_writer.edit_list.front()->IsColumnFamilyDrop()) { @@ -5732,6 +6170,8 @@ Status VersionSet::ProcessManifestWrites( descriptor_log_ = std::move(new_desc_log_ptr); obsolete_manifests_.emplace_back( DescriptorFileName("", manifest_file_number_)); + last_compacted_manifest_file_size_ = new_manifest_file_size; + TuneMaxManifestFileSize(); } // Install the new versions @@ -5741,7 +6181,8 @@ Status VersionSet::ProcessManifestWrites( assert(new_cf_options != nullptr); assert(max_last_sequence == descriptor_last_sequence_); CreateColumnFamily(*new_cf_options, read_options, - first_writer.edit_list.front()); + first_writer.edit_list.front(), + /*read_only*/ false); } else if (first_writer.edit_list.front()->IsColumnFamilyDrop()) { assert(batch_edits.size() == 1); assert(max_last_sequence == descriptor_last_sequence_); @@ -5813,21 +6254,21 @@ Status VersionSet::ProcessManifestWrites( // that renaming tmp file to CURRENT failed. // // On local POSIX-compliant FS, the CURRENT must point to the original - // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also - // keep it. Future recovery will ignore this MANIFEST. It's also ok for the - // process not to crash and continue using the db. Any future LogAndApply() - // call will switch to a new MANIFEST and update CURRENT, still ignoring - // this one. + // MANIFEST. We can delete the new MANIFEST for simplicity, but we can + // also keep it. Future recovery will ignore this MANIFEST. It's also ok + // for the process not to crash and continue using the db. Any future + // LogAndApply() call will switch to a new MANIFEST and update CURRENT, + // still ignoring this one. // // On non-local FS, it is // possible that the rename operation succeeded on the server (remote) // side, but the client somehow returns a non-ok status to RocksDB. Note // that this does not violate atomicity. Should we delete the new MANIFEST // successfully, a subsequent recovery attempt will likely see the CURRENT - // pointing to the new MANIFEST, thus fail. We will not be able to open the - // DB again. Therefore, if manifest operations succeed, we should keep the - // the new MANIFEST. If the process proceeds, any future LogAndApply() call - // will switch to a new MANIFEST and update CURRENT. If user tries to + // pointing to the new MANIFEST, thus fail. We will not be able to open + // the DB again. Therefore, if manifest operations succeed, we should keep + // the the new MANIFEST. If the process proceeds, any future LogAndApply() + // call will switch to a new MANIFEST and update CURRENT. If user tries to // re-open the DB, // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present. // b) CURRENT points to the original MANIFEST, and the original MANIFEST @@ -5956,9 +6397,9 @@ Status VersionSet::LogAndApply( first_writer.cv.Wait(); } if (first_writer.done) { - // All non-CF-manipulation operations can be grouped together and committed - // to MANIFEST. They should all have finished. The status code is stored in - // the first manifest writer. + // All non-CF-manipulation operations can be grouped together and + // committed to MANIFEST. They should all have finished. The status code + // is stored in the first manifest writer. #ifndef NDEBUG for (const auto& writer : writers) { assert(writer.done); @@ -6012,8 +6453,8 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit, assert(!edit->HasLastSequence()); edit->SetLastSequence(*max_last_sequence); if (edit->IsColumnFamilyDrop()) { - // if we drop column family, we have to make sure to save max column family, - // so that we don't reuse existing ID + // if we drop column family, we have to make sure to save max column + // family, so that we don't reuse existing ID edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily()); } } @@ -6302,7 +6743,8 @@ void VersionSet::RecoverEpochNumbers() { Status VersionSet::ListColumnFamilies(std::vector* column_families, const std::string& dbname, FileSystem* fs) { - // Read "CURRENT" file, which contains a pointer to the current manifest file + // Read "CURRENT" file, which contains a pointer to the current manifest + // file std::string manifest_path; uint64_t manifest_file_number; Status s = GetCurrentManifestPath(dbname, fs, /*is_retry=*/false, @@ -6364,17 +6806,19 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, const ReadOptions read_options; const WriteOptions write_options; - ImmutableDBOptions db_options(*options); + ImmutableDBOptions imm_db_options(*options); + MutableDBOptions mutable_db_options(*options); ColumnFamilyOptions cf_options(*options); std::shared_ptr tc(NewLRUCache(options->max_open_files - 10, options->table_cache_numshardbits)); WriteController wc(options->delayed_write_rate); WriteBufferManager wb(options->db_write_buffer_size); - VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc, - nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/, + VersionSet versions(dbname, &imm_db_options, mutable_db_options, file_options, + tc.get(), &wb, &wc, nullptr /*BlockCacheTracer*/, + nullptr /*IOTracer*/, /*db_id*/ "", /*db_session_id*/ "", options->daily_offpeak_time_utc, - /*error_handler_*/ nullptr, /*read_only=*/false); + /*error_handler_*/ nullptr, /*unchanging=*/false); Status status; std::vector dummy; @@ -6457,9 +6901,9 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, // Get the checksum information including the checksum and checksum function // name of all SST and blob files in VersionSet. Store the information in -// FileChecksumList which contains a map from file number to its checksum info. -// If DB is not running, make sure call VersionSet::Recover() to load the file -// metadata from Manifest to VersionSet before calling this function. +// FileChecksumList which contains a map from file number to its checksum +// info. If DB is not running, make sure call VersionSet::Recover() to load +// the file metadata from Manifest to VersionSet before calling this function. Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { // Clean the previously stored checksum information if any. Status s; @@ -6601,8 +7045,8 @@ Status VersionSet::WriteCurrentStateToManifest( // WARNING: This method doesn't hold a mutex!! // This is done without DB mutex lock held, but only within single-threaded - // LogAndApply. Column family manipulations can only happen within LogAndApply - // (the same single thread), so we're safe to iterate. + // LogAndApply. Column family manipulations can only happen within + // LogAndApply (the same single thread), so we're safe to iterate. assert(io_s.ok()); if (db_options_->write_dbid_to_manifest) { @@ -6636,9 +7080,9 @@ Status VersionSet::WriteCurrentStateToManifest( } // New manifest should rollover the WAL deletion record from previous - // manifest. Otherwise, when an addition record of a deleted WAL gets added to - // this new manifest later (which can happens in e.g, SyncWAL()), this new - // manifest creates an illusion that such WAL hasn't been deleted. + // manifest. Otherwise, when an addition record of a deleted WAL gets added + // to this new manifest later (which can happens in e.g, SyncWAL()), this + // new manifest creates an illusion that such WAL hasn't been deleted. VersionEdit wal_deletions; wal_deletions.DeleteWalsBefore(min_log_number_to_keep()); std::string wal_deletions_record; @@ -6698,7 +7142,6 @@ Status VersionSet::WriteCurrentStateToManifest( for (const auto& f : level_files) { assert(f); - edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, @@ -6707,7 +7150,8 @@ Status VersionSet::WriteCurrentStateToManifest( f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, f->compensated_range_deletion_size, f->tail_size, - f->user_defined_timestamps_persisted); + f->user_defined_timestamps_persisted, f->min_timestamp, + f->max_timestamp); } } @@ -6770,9 +7214,9 @@ Status VersionSet::WriteCurrentStateToManifest( // TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this // function is called repeatedly with consecutive pairs of slices. For example // if the slice list is [a, b, c, d] this function is called with arguments -// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where -// we avoid doing binary search for the keys b and c twice and instead somehow -// maintain state of where they first appear in the files. +// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible +// where we avoid doing binary search for the keys b and c twice and instead +// somehow maintain state of where they first appear in the files. uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, const ReadOptions& read_options, Version* v, const Slice& start, @@ -6793,19 +7237,20 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, } // Outline of the optimization that uses options.files_size_error_margin. - // When approximating the files total size that is used to store a keys range, - // we first sum up the sizes of the files that fully fall into the range. - // Then we sum up the sizes of all the files that may intersect with the range - // (this includes all files in L0 as well). Then, if total_intersecting_size - // is smaller than total_full_size * options.files_size_error_margin - we can - // infer that the intersecting files have a sufficiently negligible - // contribution to the total size, and we can approximate the storage required - // for the keys in range as just half of the intersecting_files_size. - // E.g., if the value of files_size_error_margin is 0.1, then the error of the - // approximation is limited to only ~10% of the total size of files that fully - // fall into the keys range. In such case, this helps to avoid a costly - // process of binary searching the intersecting files that is required only - // for a more precise calculation of the total size. + // When approximating the files total size that is used to store a keys + // range, we first sum up the sizes of the files that fully fall into the + // range. Then we sum up the sizes of all the files that may intersect with + // the range (this includes all files in L0 as well). Then, if + // total_intersecting_size is smaller than total_full_size * + // options.files_size_error_margin - we can infer that the intersecting + // files have a sufficiently negligible contribution to the total size, and + // we can approximate the storage required for the keys in range as just + // half of the intersecting_files_size. E.g., if the value of + // files_size_error_margin is 0.1, then the error of the approximation is + // limited to only ~10% of the total size of files that fully fall into the + // keys range. In such case, this helps to avoid a costly process of binary + // searching the intersecting files that is required only for a more precise + // calculation of the total size. autovector first_files; autovector last_files; @@ -6877,10 +7322,11 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, total_intersecting_size += file_ptr->fd.GetFileSize(); } - // Now scan all the first & last files at each level, and estimate their size. - // If the total_intersecting_size is less than X% of the total_full_size - we - // want to approximate the result in order to avoid the costly binary search - // inside ApproximateSize. We use half of file size as an approximation below. + // Now scan all the first & last files at each level, and estimate their + // size. If the total_intersecting_size is less than X% of the + // total_full_size - we want to approximate the result in order to avoid the + // costly binary search inside ApproximateSize. We use half of file size as + // an approximation below. const double margin = options.files_size_error_margin; if (margin > 0 && total_intersecting_size < @@ -7148,7 +7594,8 @@ InternalIterator* VersionSet::MakeInputIterator( /*no per level latency histogram=*/nullptr, TableReaderCaller::kCompaction, /*skip_filters=*/false, /*level=*/static_cast(c->level(which)), range_del_agg, - c->boundaries(which), false, &tombstone_iter_ptr); + c->boundaries(which), false, &tombstone_iter_ptr, + db_options_->statistics.get(), clock_); range_tombstones.emplace_back(nullptr, tombstone_iter_ptr); } } @@ -7294,8 +7741,10 @@ uint64_t VersionSet::GetObsoleteSstFilesSize() const { ColumnFamilyData* VersionSet::CreateColumnFamily( const ColumnFamilyOptions& cf_options, const ReadOptions& read_options, - const VersionEdit* edit) { + const VersionEdit* edit, bool read_only) { assert(edit->IsColumnFamilyAdd()); + // Unchanging LSM tree implies no writes to the CF + assert(!unchanging_ || read_only); MutableCFOptions dummy_cf_options; Version* dummy_versions = @@ -7305,7 +7754,7 @@ ColumnFamilyData* VersionSet::CreateColumnFamily( dummy_versions->Ref(); auto new_cfd = column_family_set_->CreateColumnFamily( edit->GetColumnFamilyName(), edit->GetColumnFamily(), dummy_versions, - cf_options); + cf_options, read_only); Version* v = new Version(new_cfd, this, file_options_, new_cfd->GetLatestMutableCFOptions(), io_tracer_, @@ -7421,15 +7870,16 @@ Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options, } ReactiveVersionSet::ReactiveVersionSet( - const std::string& dbname, const ImmutableDBOptions* _db_options, + const std::string& dbname, const ImmutableDBOptions* imm_db_options, + const MutableDBOptions& mutable_db_options, const FileOptions& _file_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, WriteController* write_controller, const std::shared_ptr& io_tracer) - : VersionSet(dbname, _db_options, _file_options, table_cache, - write_buffer_manager, write_controller, + : VersionSet(dbname, imm_db_options, mutable_db_options, _file_options, + table_cache, write_buffer_manager, write_controller, /*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "", /*db_session_id*/ "", /*daily_offpeak_time_utc*/ "", - /*error_handler=*/nullptr, /*read_only=*/true) {} + /*error_handler=*/nullptr, /*unchanging=*/false) {} ReactiveVersionSet::~ReactiveVersionSet() = default; @@ -7550,8 +8000,8 @@ Status ReactiveVersionSet::MaybeSwitchManifest( } } else if (s.IsPathNotFound()) { // This can happen if the primary switches to a new MANIFEST after the - // secondary reads the CURRENT file but before the secondary actually tries - // to open the MANIFEST. + // secondary reads the CURRENT file but before the secondary actually + // tries to open the MANIFEST. s = Status::TryAgain( "The primary may have switched to a new MANIFEST and deleted the old " "one."); diff --git a/db/version_set.h b/db/version_set.h index 72ae58f162c8..47a677cf59e6 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -200,7 +200,8 @@ class VersionStorageInfo { // REQUIRES: db_mutex held!! // TODO find a better way to pass compaction_options_fifo. void ComputeCompactionScore(const ImmutableOptions& immutable_options, - const MutableCFOptions& mutable_cf_options); + const MutableCFOptions& mutable_cf_options, + const std::string& full_history_ts_low); // Estimate est_comp_needed_bytes_ void EstimateCompactionBytesNeeded( @@ -230,8 +231,15 @@ class VersionStorageInfo { // oldest snapshot changes as that is when bottom-level files can become // eligible for compaction. // + // For columns with User Defined Timestamps (UDT), also checks that the + // file's largest timestamp is below full_history_ts_low before marking, + // since compaction can only collapse timestamp when it is below this + // threshold. + // // REQUIRES: DB mutex held - void ComputeBottommostFilesMarkedForCompaction(bool allow_ingest_behind); + void ComputeBottommostFilesMarkedForCompaction( + bool allow_ingest_behind, const Comparator* ucmp, + const std::string& full_history_ts_low); // This computes files_marked_for_forced_blob_gc_ and is called by // ComputeCompactionScore() @@ -248,7 +256,8 @@ class VersionStorageInfo { // files marked for compaction. // REQUIRES: DB mutex held void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum, - bool allow_ingest_behind); + bool allow_ingest_behind, const Comparator* ucmp, + const std::string& full_history_ts_low); int MaxInputLevel() const; int MaxOutputLevel(bool allow_ingest_behind) const; @@ -268,8 +277,13 @@ class VersionStorageInfo { bool expand_range = true, // if set, returns files which overlap the // range and overlap each other. If false, // then just files intersecting the range - InternalKey** next_smallest = nullptr) // if non-null, returns the - const; // smallest key of next file not included + const FileMetaData* starting_l0_file = + nullptr, // If not null, restricts L0 file selection to only include + // files at or older than starting_l0_file. + InternalKey** next_smallest = + nullptr // if non-null, returns the + // smallest key of next file not included + ) const; void GetCleanInputsWithinInterval( int level, const InternalKey* begin, // nullptr means before all keys const InternalKey* end, // nullptr means after all keys @@ -286,8 +300,10 @@ class VersionStorageInfo { int hint_index, // index of overlap file int* file_index, // return index of overlap file bool within_interval = false, // if set, force the inputs within interval - InternalKey** next_smallest = nullptr) // if non-null, returns the - const; // smallest key of next file not included + InternalKey** next_smallest = + nullptr // if non-null, returns the + // smallest key of next file not included + ) const; // Returns true iff some file in the specified level overlaps // some part of [*smallest_user_key,*largest_user_key]. @@ -630,7 +646,8 @@ class VersionStorageInfo { const Slice& largest_user_key, int last_level, int last_l0_idx); - Env::WriteLifeTimeHint CalculateSSTWriteHint(int level) const; + Env::WriteLifeTimeHint CalculateSSTWriteHint( + int level, CompactionStyleSet compaction_style_set) const; const Comparator* user_comparator() const { return user_comparator_; } @@ -668,6 +685,8 @@ class VersionStorageInfo { // List of files per level, files in each level are arranged // in increasing order of keys + // In L0, files are ordered in decreasing epoch number, meaning + // more recent updates are ordered first. std::vector* files_; // Map of all table files in version. Maps file number to (level, position on @@ -993,17 +1012,21 @@ class Version { const FileMetaData* file_meta, const std::string* fname = nullptr) const; - // REQUIRES: lock is held // On success, *props will be populated with all SSTables' table properties. // The keys of `props` are the sst file name, the values of `props` are the // tables' properties, represented as std::shared_ptr. Status GetPropertiesOfAllTables(const ReadOptions& read_options, - TablePropertiesCollection* props); + TablePropertiesCollection* props) const; Status GetPropertiesOfAllTables(const ReadOptions& read_options, - TablePropertiesCollection* props, int level); + TablePropertiesCollection* props, + int level) const; Status GetPropertiesOfTablesInRange(const ReadOptions& read_options, const autovector& ranges, TablePropertiesCollection* props) const; + Status GetPropertiesOfTablesByLevel( + const ReadOptions& read_options, + std::vector>* props_by_level) + const; // Print summary of range delete tombstones in SST files into out_str, // with maximum max_entries_to_print entries printed out. @@ -1037,6 +1060,10 @@ class Version { void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta); + // Get column family metadata with optional filtering by key range and level. + void GetColumnFamilyMetaData(const GetColumnFamilyMetaDataOptions& options, + ColumnFamilyMetaData* cf_meta); + void GetSstFilesBoundaryKeys(Slice* smallest_user_key, Slice* largest_user_key); @@ -1174,9 +1201,14 @@ class AtomicGroupReadBuffer { // VersionSet is the collection of versions of all the column families of the // database. Each database owns one VersionSet. A VersionSet has access to all // column families via ColumnFamilySet, i.e. set of the column families. +// `unchanging` means the LSM tree structure of the column families will not +// change during the lifetime of this VersionSet (true for read-only instance, +// but false for secondary instance or writable DB). class VersionSet { public: - VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options, + VersionSet(const std::string& dbname, + const ImmutableDBOptions* imm_db_options, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, WriteController* write_controller, @@ -1184,7 +1216,7 @@ class VersionSet { const std::shared_ptr& io_tracer, const std::string& db_id, const std::string& db_session_id, const std::string& daily_offpeak_time_utc, - ErrorHandler* const error_handler, const bool read_only); + ErrorHandler* error_handler, bool unchanging); // No copying allowed VersionSet(const VersionSet&) = delete; void operator=(const VersionSet&) = delete; @@ -1193,6 +1225,13 @@ class VersionSet { virtual Status Close(FSDirectory* db_dir, InstrumentedMutex* mu); + // Requires: already holding DB mutex `mu`, to ensure + // * Safely read values from `updated_options` + // * Safely update fields on `this` (must be read elsewhere while holding mu) + // except `mu` can be nullptr during initialization + void UpdatedMutableDbOptions(const MutableDBOptions& updated_options, + InstrumentedMutex* mu); + Status LogAndApplyToDefaultColumnFamily( const ReadOptions& read_options, const WriteOptions& write_options, VersionEdit* edit, InstrumentedMutex* mu, @@ -1263,8 +1302,11 @@ class VersionSet { void WakeUpWaitingManifestWriters(); // Recover the last saved descriptor (MANIFEST) from persistent storage. - // If read_only == true, Recover() will not complain if some column families - // are not opened + // Unlike `unchanging` on the VersionSet, `read_only` here and in other + // functions below refers to the CF receiving no writes or modifications + // through this VersionSet, but could through external manifest updates + // etc. Thus, `read_only=true` for secondary instances as well as read-only + // instances. Status Recover(const std::vector& column_families, bool read_only = false, std::string* db_id = nullptr, bool no_error_if_files_missing = false, bool is_retry = false, @@ -1342,6 +1384,8 @@ class VersionSet { return min_log_number_to_keep_.load(); } + bool unchanging() const { return unchanging_; } + // Allocate and return a new file number uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); } @@ -1390,6 +1434,29 @@ class VersionSet { return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst); } + // Sync last_sequence_ with last_allocated_sequence_. This should be called + // during error recovery to ensure that any sequence numbers that were + // allocated (written to WAL) but not yet published are accounted for when + // creating new memtables/WALs. This prevents the "sequence number going + // backwards" corruption on subsequent recovery. + // + // This is necessary because with two_write_queues=true, writes allocate + // sequence numbers via FetchAddLastAllocatedSequence() before the write + // is complete, but only publish via SetLastSequence() after success. + // If an error occurs and recovery creates new memtables, SwitchMemtable + // uses LastSequence() which may be lower than already-allocated sequences. + // + // REQUIRED: DB mutex is held and no concurrent writers are active (i.e., + // after WaitForBackgroundWork() in ResumeImpl). + void SyncLastSequenceWithAllocated() { + uint64_t alloc_seq = + last_allocated_sequence_.load(std::memory_order_seq_cst); + uint64_t last_seq = last_sequence_.load(std::memory_order_acquire); + if (alloc_seq > last_seq) { + last_sequence_.store(alloc_seq, std::memory_order_release); + } + } + // Mark the specified file number as used. // REQUIRED: this is only called during single-threaded recovery or repair. void MarkFileNumberUsed(uint64_t number); @@ -1533,10 +1600,6 @@ class VersionSet { } const FileOptions& file_options() { return file_options_; } - void ChangeFileOptions(const MutableDBOptions& new_options) { - file_options_.writable_file_max_buffer_size = - new_options.writable_file_max_buffer_size; - } // TODO - Consider updating together when file options change in SetDBOptions const OffpeakTimeOption& offpeak_time_option() { @@ -1573,6 +1636,18 @@ class VersionSet { AppendVersion(cfd, version); } + bool& TEST_unchanging() { return const_cast(unchanging_); } + + uint64_t TEST_GetMinMaxManifestFileSize() { + return min_max_manifest_file_size_; + } + unsigned TEST_GetMaxManifestSpaceAmpPct() { + return max_manifest_space_amp_pct_; + } + size_t TEST_GetManifestPreallocationSize() { + return manifest_preallocation_size_; + } + protected: struct ManifestWriter; @@ -1593,6 +1668,7 @@ class VersionSet { } }; + // Revert back to a post-construction state (keep same options/settings) void Reset(); // Returns approximated offset of a key in a file for a given version. @@ -1625,12 +1701,17 @@ class VersionSet { ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, const ReadOptions& read_options, - const VersionEdit* edit); + const VersionEdit* edit, bool read_only); Status VerifyFileMetadata(const ReadOptions& read_options, ColumnFamilyData* cfd, const std::string& fpath, int level, const FileMetaData& meta); + // Auto-tune next max size for the current manifest file based on its initial + // "compacted" size and other parameters saved in this VersionSet. Must be + // holding DB mutex if outside of DB startup. + void TuneMaxManifestFileSize(); + // Protected by DB mutex. WalSet wals_; @@ -1657,6 +1738,9 @@ class VersionSet { // The last sequence number of data committed to the descriptor (manifest // file). SequenceNumber descriptor_last_sequence_ = 0; + // See write_prepared_txn.h for a more detailed description of how Write + // Prepared transactions work, with concrete examples. + // // The last seq that is already allocated. It is applicable only when we have // two write queues. In that case seq might or might not have appreated in // memtable but it is expected to appear in the WAL. @@ -1682,6 +1766,20 @@ class VersionSet { // Current size of manifest file uint64_t manifest_file_size_; + // Size of the populated manifest file last time it was re-written from + // scratch. + uint64_t last_compacted_manifest_file_size_; + + // Auto-tuned max allowed size for the current manifest file + uint64_t tuned_max_manifest_file_size_; + + // Saved copy of max_manifest_file_size in (Mutable)DBOptions + uint64_t min_max_manifest_file_size_; + // Saved, sanitized copy from (Mutable)DBOptions + unsigned max_manifest_space_amp_pct_; + // Saved copy from (Mutable)DBOptions + size_t manifest_preallocation_size_; + // Obsolete files, or during DB shutdown any files not referenced by what's // left of the in-memory LSM state. std::vector obsolete_files_; @@ -1722,7 +1820,7 @@ class VersionSet { VersionEdit* edit, SequenceNumber* max_last_sequence, InstrumentedMutex* mu); - const bool read_only_; + const bool unchanging_; bool closed_; }; @@ -1734,6 +1832,7 @@ class ReactiveVersionSet : public VersionSet { public: ReactiveVersionSet(const std::string& dbname, const ImmutableDBOptions* _db_options, + const MutableDBOptions& mutable_db_options, const FileOptions& _file_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, WriteController* write_controller, diff --git a/db/version_set_test.cc b/db/version_set_test.cc index c249fa6dafad..a4cf2698c078 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -26,6 +26,7 @@ #include "test_util/mock_time_env.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/defer.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -55,7 +56,8 @@ class GenerateLevelFilesBriefTest : public testing::Test { kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, - /* user_defined_timestamps_persisted */ true); + /* user_defined_timestamps_persisted */ true, /* min timestamp */ "", + /* max timestamp */ ""); files_.push_back(f); } @@ -171,7 +173,8 @@ class VersionStorageInfoTestBase : public testing::Test { kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, compensated_range_deletion_size, 0, - /* user_defined_timestamps_persisted */ true); + /* user_defined_timestamps_persisted */ true, /* min timestamp */ "", + /* max timestamp */ ""); vstorage_.AddFile(level, f); } @@ -390,7 +393,8 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) { ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3)); ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4)); - vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_); + vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_, + /*full_history_ts_low=*/""); // Only L0 hits compaction. ASSERT_EQ(vstorage_.CompactionScoreLevel(0), 0); } @@ -420,7 +424,8 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) { ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3)); ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4)); - vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_); + vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_, + /*full_history_ts_low=*/""); // Although L2 and l3 have higher unadjusted compaction score, considering // a relatively large L0 being compacted down soon, L4 is picked up for // compaction. @@ -452,7 +457,8 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) { ASSERT_EQ(2, vstorage_.base_level()); ASSERT_EQ(20000U, vstorage_.MaxBytesForLevel(2)); - vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_); + vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_, + /*full_history_ts_low=*/""); // Although L2 has higher unadjusted compaction score, considering // a relatively large L0 being compacted down soon, L3 is picked up for // compaction. @@ -482,7 +488,8 @@ TEST_F(VersionStorageInfoTest, DrainUnnecessaryLevel) { ASSERT_EQ(1, vstorage_.base_level()); ASSERT_EQ(1000, vstorage_.MaxBytesForLevel(1)); ASSERT_EQ(10100, vstorage_.MaxBytesForLevel(3)); - vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_); + vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_, + /*full_history_ts_low=*/""); // Tests that levels 1 and 3 are eligible for compaction. // Levels 1 and 3 are much smaller than target size, @@ -1158,12 +1165,12 @@ class VersionSetTestBase { : env_(nullptr), dbname_(test::PerThreadDBPath(name)), options_(), - db_options_(options_), + imm_db_options_(options_), cf_options_(options_), - immutable_options_(db_options_, cf_options_), + immutable_options_(imm_db_options_, cf_options_), mutable_cf_options_(cf_options_), table_cache_(NewLRUCache(50000, 16)), - write_buffer_manager_(db_options_.db_write_buffer_size), + write_buffer_manager_(imm_db_options_.db_write_buffer_size), shutting_down_(false), table_factory_(std::make_shared()) { EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_)); @@ -1177,8 +1184,8 @@ class VersionSetTestBase { EXPECT_OK(fs_->CreateDirIfMissing(dbname_, IOOptions(), nullptr)); options_.env = env_; - db_options_.env = env_; - db_options_.fs = fs_; + imm_db_options_.env = env_; + imm_db_options_.fs = fs_; immutable_options_.env = env_; immutable_options_.fs = fs_; immutable_options_.clock = env_->GetSystemClock().get(); @@ -1187,16 +1194,17 @@ class VersionSetTestBase { mutable_cf_options_.table_factory = table_factory_; versions_.reset(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, + dbname_, &imm_db_options_, mutable_db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", /*error_handler=*/nullptr, /*read_only=*/false)); reactive_versions_ = std::make_shared( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, nullptr); - db_options_.db_paths.emplace_back(dbname_, - std::numeric_limits::max()); + dbname_, &imm_db_options_, mutable_db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, &write_controller_, + nullptr); + imm_db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); } virtual ~VersionSetTestBase() { @@ -1219,7 +1227,7 @@ class VersionSetTestBase { ASSERT_OK( SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown)); VersionEdit new_db; - if (db_options_.write_dbid_to_manifest) { + if (imm_db_options_.write_dbid_to_manifest) { DBOptions tmp_db_options; tmp_db_options.env = env_; std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); @@ -1344,7 +1352,8 @@ class VersionSetTestBase { Temperature::kUnknown, info.oldest_blob_file_number, 0, 0, info.epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, - /* user_defined_timestamps_persisted */ true); + /* user_defined_timestamps_persisted */ true, /* min timestamp */ "", + /* max timestamp */ ""); if (info.file_missing) { ASSERT_OK(fs_->DeleteFile(fname, IOOptions(), nullptr)); } @@ -1380,8 +1389,8 @@ class VersionSetTestBase { void ReopenDB() { versions_.reset(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, + dbname_, &imm_db_options_, mutable_db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", /*error_handler=*/nullptr, /*read_only=*/false)); @@ -1470,7 +1479,8 @@ class VersionSetTestBase { const std::string dbname_; EnvOptions env_options_; Options options_; - ImmutableDBOptions db_options_; + ImmutableDBOptions imm_db_options_; + MutableDBOptions mutable_db_options_; ColumnFamilyOptions cf_options_; ImmutableOptions immutable_options_; MutableCFOptions mutable_cf_options_; @@ -1901,11 +1911,11 @@ TEST_F(VersionSetTest, WalAddition) { // Recover a new VersionSet. { std::unique_ptr new_versions(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, + dbname_, &imm_db_options_, mutable_db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr, /*read_only=*/false)); + /*error_handler=*/nullptr, /*unchanging=*/false)); ASSERT_OK(new_versions->Recover(column_families_, /*read_only=*/false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 1); @@ -1969,11 +1979,11 @@ TEST_F(VersionSetTest, WalCloseWithoutSync) { // Recover a new VersionSet. { std::unique_ptr new_versions(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, + dbname_, &imm_db_options_, mutable_db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr, /*read_only=*/false)); + /*error_handler=*/nullptr, /*unchanging=*/false)); ASSERT_OK(new_versions->Recover(column_families_, false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 2); @@ -2023,11 +2033,11 @@ TEST_F(VersionSetTest, WalDeletion) { // Recover a new VersionSet, only the non-closed WAL should show up. { std::unique_ptr new_versions(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, + dbname_, &imm_db_options_, mutable_db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr, /*read_only=*/false)); + /*error_handler=*/nullptr, /*unchanging=*/false)); ASSERT_OK(new_versions->Recover(column_families_, false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 1); @@ -2062,11 +2072,11 @@ TEST_F(VersionSetTest, WalDeletion) { // Recover from the new MANIFEST, only the non-closed WAL should show up. { std::unique_ptr new_versions(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, + dbname_, &imm_db_options_, mutable_db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr, /*read_only=*/false)); + /*error_handler=*/nullptr, /*unchanging=*/false)); ASSERT_OK(new_versions->Recover(column_families_, false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 1); @@ -2183,11 +2193,11 @@ TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) { // Recover a new VersionSet, WAL0 is deleted, WAL1 is not. { std::unique_ptr new_versions(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, + dbname_, &imm_db_options_, mutable_db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr, /*read_only=*/false)); + /*error_handler=*/nullptr, /*unchanging=*/false)); ASSERT_OK(new_versions->Recover(column_families_, false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 1); @@ -2220,11 +2230,11 @@ TEST_F(VersionSetTest, DeleteAllWals) { // Recover a new VersionSet, all WALs are deleted. { std::unique_ptr new_versions(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, + dbname_, &imm_db_options_, mutable_db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr, /*read_only=*/false)); + /*error_handler=*/nullptr, /*unchanging=*/false)); ASSERT_OK(new_versions->Recover(column_families_, false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 0); @@ -2263,11 +2273,11 @@ TEST_F(VersionSetTest, AtomicGroupWithWalEdits) { // kept. { std::unique_ptr new_versions(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, + dbname_, &imm_db_options_, mutable_db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr, /*read_only=*/false)); + /*error_handler=*/nullptr, /*unchanging=*/false)); std::string db_id; ASSERT_OK( new_versions->Recover(column_families_, /*read_only=*/false, &db_id)); @@ -2443,11 +2453,11 @@ class VersionSetWithTimestampTest : public VersionSetTest { void VerifyFullHistoryTsLow(uint64_t expected_ts_low) { std::unique_ptr vset(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, + dbname_, &imm_db_options_, mutable_db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr, /*read_only=*/false)); + /*error_handler=*/nullptr, /*unchanging=*/false)); ASSERT_OK(vset->Recover(column_families_, /*read_only=*/false, /*db_id=*/nullptr)); for (auto* cfd : *(vset->GetColumnFamilySet())) { @@ -3499,7 +3509,7 @@ class VersionSetTestEmptyDb std::unique_ptr* log_writer) override { assert(nullptr != log_writer); VersionEdit new_db; - if (db_options_.write_dbid_to_manifest) { + if (imm_db_options_.write_dbid_to_manifest) { ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown)); DBOptions tmp_db_options; @@ -3531,7 +3541,7 @@ class VersionSetTestEmptyDb const std::string VersionSetTestEmptyDb::kUnknownColumnFamilyName = "unknown"; TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) { - db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); + imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); PrepareManifest(nullptr, nullptr, &log_writer_); log_writer_.reset(); CreateCurrentFile(); @@ -3563,7 +3573,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) { } TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) { - db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); + imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); PrepareManifest(nullptr, nullptr, &log_writer_); // Only a subset of column families in the MANIFEST. VersionEdit new_cf1; @@ -3604,7 +3614,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) { } TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) { - db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); + imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); PrepareManifest(nullptr, nullptr, &log_writer_); // Write all column families but no log_number, next_file_number and // last_sequence. @@ -3650,7 +3660,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) { } TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) { - db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); + imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); PrepareManifest(nullptr, nullptr, &log_writer_); // Write all column families but no log_number, next_file_number and // last_sequence. @@ -3707,7 +3717,7 @@ TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) { } TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) { - db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); + imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); PrepareManifest(nullptr, nullptr, &log_writer_); // Write all column families but no log_number, next_file_number and // last_sequence. @@ -3749,6 +3759,8 @@ TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) { } std::string db_id; bool has_missing_table_file = false; + SaveAndRestore override_unchanging(&versions_->TEST_unchanging(), + read_only); s = versions_->TryRecoverFromOneManifest(manifest_path, column_families, read_only, &db_id, &has_missing_table_file); @@ -3825,7 +3837,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, ASSERT_OK(s); log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); VersionEdit new_db; - if (db_options_.write_dbid_to_manifest) { + if (imm_db_options_.write_dbid_to_manifest) { DBOptions tmp_db_options; tmp_db_options.env = env_; std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); @@ -3935,7 +3947,8 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, file_num /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, - /* user_defined_timestamps_persisted */ true); + /* user_defined_timestamps_persisted */ true, /* min timestamp */ "", + /* max timestamp */ ""); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( @@ -3996,7 +4009,8 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, file_num /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, - /* user_defined_timestamps_persisted */ true); + /* user_defined_timestamps_persisted */ true, /* min timestamp */ "", + /* max timestamp */ ""); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( @@ -4085,7 +4099,7 @@ TEST_F(VersionSetTestMissingFiles, NoFileMissing) { } TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) { - db_options_.allow_2pc = true; + imm_db_options_.allow_2pc = true; NewDB(); SstInfo sst(100, kDefaultColumnFamilyName, "a", 0 /* level */, diff --git a/db/version_util.h b/db/version_util.h index 2690a00f48d9..7219d11c854b 100644 --- a/db/version_util.h +++ b/db/version_util.h @@ -1,4 +1,4 @@ -// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) Meta Platforms, Inc. and affiliates. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). @@ -23,7 +23,8 @@ class OfflineManifestWriter { immutable_db_options_(WithDbPath(options, db_path)), tc_(NewLRUCache(1 << 20 /* capacity */, options.table_cache_numshardbits)), - versions_(db_path, &immutable_db_options_, sopt_, tc_.get(), &wb_, &wc_, + versions_(db_path, &immutable_db_options_, MutableDBOptions{options}, + sopt_, tc_.get(), &wb_, &wc_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", options.daily_offpeak_time_utc, diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 60e85567be4a..67582c80552f 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -192,7 +192,13 @@ void WalManager::PurgeObsoleteWALFiles() { s.ToString().c_str()); continue; } - if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { + + // Avoid expression `now_seconds - file_m_time` when + // `file_m_time > now_seconds` to prevent unsigned underflow in case + // system clock goes backwards. Both timestamps are based on wall clock + // time, which is not guaranteed to be monotonic. + if (file_m_time <= now_seconds && + now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { s = DeleteDBFile(&db_options_, file_path, archival_dir, false, /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { @@ -283,6 +289,7 @@ void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) { // The sync point below is used in (DBTest,TransactionLogIteratorRace) TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1"); Status s = env_->RenameFile(fname, archived_log_name); + IGNORE_STATUS_IF_ERROR(s); // The sync point below is used in (DBTest,TransactionLogIteratorRace) TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:2"); // The sync point below is used in diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 5b5ba7c0a872..e674e7b778c9 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -19,6 +19,7 @@ #include "rocksdb/write_batch.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" +#include "test_util/mock_time_env.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/string_util.h" @@ -39,7 +40,7 @@ class WalManagerTest : public testing::Test { EXPECT_OK(DestroyDB(dbname_, Options())); } - void Init() { + void Init(SystemClock* clock_override) { ASSERT_OK(env_->CreateDirIfMissing(dbname_)); ASSERT_OK(env_->CreateDirIfMissing(ArchivalDirectory(dbname_))); db_options_.db_paths.emplace_back(dbname_, @@ -47,11 +48,15 @@ class WalManagerTest : public testing::Test { db_options_.wal_dir = dbname_; db_options_.env = env_.get(); db_options_.fs = env_->GetFileSystem(); - db_options_.clock = env_->GetSystemClock().get(); + if (clock_override == nullptr) { + db_options_.clock = env_->GetSystemClock().get(); + } else { + db_options_.clock = clock_override; + } versions_.reset(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, + dbname_, &db_options_, MutableDBOptions{}, env_options_, + table_cache_.get(), &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", /*error_handler=*/nullptr, /*read_only=*/false)); @@ -124,7 +129,7 @@ class WalManagerTest : public testing::Test { }; TEST_F(WalManagerTest, ReadFirstRecordCache) { - Init(); + Init(nullptr /* clock_override */); std::string path = dbname_ + "/000001.log"; std::unique_ptr file; ASSERT_OK(env_->GetFileSystem()->NewWritableFile(path, FileOptions(), &file, @@ -221,7 +226,7 @@ int CountRecords(TransactionLogIterator* iter) { TEST_F(WalManagerTest, WALArchivalSizeLimit) { db_options_.WAL_ttl_seconds = 0; db_options_.WAL_size_limit_MB = 1000; - Init(); + Init(nullptr /* clock_override */); // TEST : Create WalManager with huge size limit and no ttl. // Create some archived files and call PurgeObsoleteWALFiles(). @@ -258,7 +263,7 @@ TEST_F(WalManagerTest, WALArchivalSizeLimit) { TEST_F(WalManagerTest, WALArchivalTtl) { db_options_.WAL_ttl_seconds = 1000; - Init(); + Init(nullptr /* clock_override */); // TEST : Create WalManager with a ttl and no size limit. // Create some archived log files and call PurgeObsoleteWALFiles(). @@ -282,8 +287,41 @@ TEST_F(WalManagerTest, WALArchivalTtl) { ASSERT_TRUE(log_files.empty()); } +TEST_F(WalManagerTest, WALArchivalTtlClockGoesBackwards) { + // This test used to trigger an unsigned underflow bug, where WAL files were + // incorrectly deleted when the system time moved backwards between writing + // to a WAL and running `WalManager::PurgeObsoleteWALFiles()`. + constexpr int kNumLogs = 5; + constexpr int kEntriesPerLog = 100; + + db_options_.WAL_ttl_seconds = 86400; // One day + + // Configure mock clock to lag one second behind system time. That way, the + // WAL file's mtime will appear to be in the future when + // `WalManager::PurgeObsoleteWALFiles()` runs. + int64_t now_seconds; + ASSERT_OK(env_->GetSystemClock()->GetCurrentTime(&now_seconds)); + auto mock_clock = std::make_shared(env_->GetSystemClock()); + mock_clock->SetCurrentTime(static_cast(now_seconds - 1)); + db_options_.clock = mock_clock.get(); + + Init(mock_clock.get() /* clock */); + + CreateArchiveLogs(kNumLogs, kEntriesPerLog); + + const std::string archive_dir = ArchivalDirectory(dbname_); + ASSERT_EQ(kNumLogs, + ListSpecificFiles(env_.get(), archive_dir, kWalFile).size()); + + wal_manager_->PurgeObsoleteWALFiles(); + + // All files must still be present because TTL has not elapsed. + ASSERT_EQ(kNumLogs, + ListSpecificFiles(env_.get(), archive_dir, kWalFile).size()); +} + TEST_F(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) { - Init(); + Init(nullptr /* clock_override */); RollTheLog(false); Put("key1", std::string(1024, 'a')); // Create a zero record WAL file. @@ -297,7 +335,7 @@ TEST_F(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) { } TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) { - Init(); + Init(nullptr /* clock_override */); RollTheLog(false); auto iter = OpenTransactionLogIter(0); // Check that an empty iterator is returned @@ -305,7 +343,7 @@ TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) { } TEST_F(WalManagerTest, TransactionLogIteratorNewFileWhileScanning) { - Init(); + Init(nullptr /* clock_override */); CreateArchiveLogs(2, 100); auto iter = OpenTransactionLogIter(0); CreateArchiveLogs(1, 100); diff --git a/db/wide/db_wide_basic_test.cc b/db/wide/db_wide_basic_test.cc index 886f71d7452f..5c46c3c6443f 100644 --- a/db/wide/db_wide_basic_test.cc +++ b/db/wide/db_wide_basic_test.cc @@ -714,7 +714,7 @@ TEST_F(DBWideBasicTest, MergePlainKeyValue) { // snapshot in between to make sure they do not get reconciled during the // subsequent flush) write_base(); - ManagedSnapshot snapshot(db_); + ManagedSnapshot snapshot(db_.get()); write_merge(); verify(); @@ -958,7 +958,7 @@ TEST_F(DBWideBasicTest, MergeEntity) { // between to make sure they do not get reconciled during the subsequent // flush) write_base(); - ManagedSnapshot snapshot(db_); + ManagedSnapshot snapshot(db_.get()); write_merge(); verify_basic(); verify_merge_ops_pre_compaction(); @@ -1033,7 +1033,7 @@ class DBWideMergeV3Test : public DBWideBasicTest { third_key, third_columns)); // wide-column base value - snapshots_.emplace_back(db_); + snapshots_.emplace_back(db_.get()); // First round of merge operands ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key, @@ -1043,7 +1043,7 @@ class DBWideMergeV3Test : public DBWideBasicTest { ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), third_key, third_merge_op1)); - snapshots_.emplace_back(db_); + snapshots_.emplace_back(db_.get()); // Second round of merge operands ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key, @@ -1053,7 +1053,7 @@ class DBWideMergeV3Test : public DBWideBasicTest { ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), third_key, third_merge_op2)); - snapshots_.emplace_back(db_); + snapshots_.emplace_back(db_.get()); } void VerifyKeyValues(const WideColumns& first_expected, diff --git a/db/wide/wide_column_serialization.cc b/db/wide/wide_column_serialization.cc index 0366a5db977d..8371b7cbbd30 100644 --- a/db/wide/wide_column_serialization.cc +++ b/db/wide/wide_column_serialization.cc @@ -5,10 +5,12 @@ #include "db/wide/wide_column_serialization.h" -#include #include -#include +#include +#include "db/blob/blob_fetcher.h" +#include "db/blob/blob_index.h" +#include "db/blob/prefetch_buffer_collection.h" #include "db/wide/wide_columns_helper.h" #include "rocksdb/slice.h" #include "util/autovector.h" @@ -16,15 +18,46 @@ namespace ROCKSDB_NAMESPACE { +Status WideColumnSerialization::BuildBlobIndexMap( + size_t num_columns, + const std::vector>& blob_columns, + std::vector& blob_index_map) { + if (Status s = ValidateWideColumnLimit(num_columns, "Too many wide columns"); + !s.ok()) { + return s; + } + + blob_index_map.assign(num_columns, nullptr); + for (const auto& blob_col : blob_columns) { + if (blob_col.first >= blob_index_map.size()) { + return Status::InvalidArgument("Blob column index out of range"); + } + blob_index_map[blob_col.first] = &blob_col.second; + } + + return Status::OK(); +} + +bool WideColumnSerialization::ContainsBlobType(const char* type_bytes, + uint32_t num_columns) { + for (uint32_t i = 0; i < num_columns; ++i) { + if (static_cast(type_bytes[i]) == kTypeBlobIndex) { + return true; + } + } + return false; +} + Status WideColumnSerialization::Serialize(const WideColumns& columns, std::string& output) { const size_t num_columns = columns.size(); - if (num_columns > static_cast(std::numeric_limits::max())) { - return Status::InvalidArgument("Too many wide columns"); + if (Status sv = ValidateWideColumnLimit(num_columns, "Too many wide columns"); + !sv.ok()) { + return sv; } - PutVarint32(&output, kCurrentVersion); + PutVarint32(&output, kVersion1); PutVarint32(&output, static_cast(num_columns)); @@ -34,19 +67,23 @@ Status WideColumnSerialization::Serialize(const WideColumns& columns, const WideColumn& column = columns[i]; const Slice& name = column.name(); - if (name.size() > - static_cast(std::numeric_limits::max())) { - return Status::InvalidArgument("Wide column name too long"); + if (Status s_name = + ValidateWideColumnLimit(name.size(), "Wide column name too long"); + !s_name.ok()) { + return s_name; } - if (prev_name && prev_name->compare(name) >= 0) { - return Status::Corruption("Wide columns out of order"); + if (prev_name) { + if (Status so = ValidateColumnOrder(*prev_name, name); !so.ok()) { + return so; + } } const Slice& value = column.value(); - if (value.size() > - static_cast(std::numeric_limits::max())) { - return Status::InvalidArgument("Wide column value too long"); + if (Status s_val = + ValidateWideColumnLimit(value.size(), "Wide column value too long"); + !s_val.ok()) { + return s_val; } PutLengthPrefixedSlice(&output, name); @@ -64,28 +101,151 @@ Status WideColumnSerialization::Serialize(const WideColumns& columns, return Status::OK(); } -Status WideColumnSerialization::Deserialize(Slice& input, - WideColumns& columns) { - assert(columns.empty()); - - uint32_t version = 0; - if (!GetVarint32(&input, &version)) { - return Status::Corruption("Error decoding wide column version"); +template +Status WideColumnSerialization::SerializeV2Impl( + size_t num_columns, + const std::vector>& blob_columns, + std::string& output, GetName get_name, GetValue get_value) { + std::vector blob_index_map; + if (Status s = BuildBlobIndexMap(num_columns, blob_columns, blob_index_map); + !s.ok()) { + return s; } + assert(blob_index_map.size() == num_columns); + + // First pass: validate column ordering, compute sizes, serialize blob + // indices, and build column types. + std::vector serialized_blob_indices(num_columns); + std::vector name_sizes(num_columns); + std::vector value_sizes(num_columns); + std::string column_types; + column_types.reserve(num_columns); + + Slice prev_name_storage; + bool has_prev = false; + uint32_t name_sizes_bytes = 0; + uint32_t names_bytes = 0; + uint32_t total_value_sizes_bytes = 0; + uint32_t total_values_bytes = 0; + + for (size_t i = 0; i < num_columns; ++i) { + const Slice name = get_name(i); + const Slice value = get_value(i); + + if (Status sn = + ValidateWideColumnLimit(name.size(), "Wide column name too long"); + !sn.ok()) { + return sn; + } - if (version > kCurrentVersion) { - return Status::NotSupported("Unsupported wide column version"); - } + if (has_prev) { + if (Status so = ValidateColumnOrder(prev_name_storage, name); !so.ok()) { + return so; + } + } - uint32_t num_columns = 0; - if (!GetVarint32(&input, &num_columns)) { - return Status::Corruption("Error decoding number of wide columns"); + name_sizes[i] = static_cast(name.size()); + name_sizes_bytes += VarintLength(name_sizes[i]); + names_bytes += name_sizes[i]; + + if (blob_index_map[i] != nullptr) { + const BlobIndex* blob_idx = blob_index_map[i]; + blob_idx->EncodeTo(&serialized_blob_indices[i]); + value_sizes[i] = static_cast(serialized_blob_indices[i].size()); + column_types.push_back(static_cast(kTypeBlobIndex)); + } else { + if (Status svl = ValidateWideColumnLimit(value.size(), + "Wide column value too long"); + !svl.ok()) { + return svl; + } + value_sizes[i] = static_cast(value.size()); + column_types.push_back(static_cast(kTypeValue)); + } + + total_value_sizes_bytes += VarintLength(value_sizes[i]); + total_values_bytes += value_sizes[i]; + + prev_name_storage = name; + has_prev = true; } - if (!num_columns) { + // Second pass: write all V2 sections to output. + // Pre-allocate output string. + const size_t total_size = + VarintLength(kVersion2) + + VarintLength(static_cast(num_columns)) + + num_columns + // column types + VarintLength(name_sizes_bytes) + VarintLength(total_value_sizes_bytes) + + VarintLength(names_bytes) + name_sizes_bytes + total_value_sizes_bytes + + names_bytes + total_values_bytes; + + const size_t base_offset = output.size(); + output.reserve(base_offset + total_size); + + // Sections 1-3: header, skip info, column types + PutVarint32(&output, kVersion2); + PutVarint32(&output, static_cast(num_columns)); + PutVarint32(&output, name_sizes_bytes); + PutVarint32(&output, total_value_sizes_bytes); + PutVarint32(&output, names_bytes); + output.append(column_types); + + // Sections 4-7: resize to final size, then write all 4 sections in a + // single loop using independent pointers. Each section's start offset is + // known from the sizes computed in the first pass. + if (num_columns == 0) { return Status::OK(); } + const size_t sec4_offset = output.size(); + output.resize(base_offset + total_size); + + char* s4 = &output[sec4_offset]; // section 4: name sizes + char* s5 = s4 + name_sizes_bytes; // section 5: value sizes + char* s6 = s5 + total_value_sizes_bytes; // section 6: names + char* s7 = s6 + names_bytes; // section 7: values + + for (size_t i = 0; i < num_columns; ++i) { + s4 = EncodeVarint32(s4, name_sizes[i]); + s5 = EncodeVarint32(s5, value_sizes[i]); + + memcpy(s6, get_name(i).data(), name_sizes[i]); + s6 += name_sizes[i]; + + if (blob_index_map[i] != nullptr) { + memcpy(s7, serialized_blob_indices[i].data(), value_sizes[i]); + } else { + memcpy(s7, get_value(i).data(), value_sizes[i]); + } + s7 += value_sizes[i]; + } + + return Status::OK(); +} + +Status WideColumnSerialization::SerializeV2( + const std::vector>& columns, + const std::vector>& blob_columns, + std::string& output) { + return SerializeV2Impl( + columns.size(), blob_columns, output, + [&](size_t i) { return Slice(columns[i].first); }, + [&](size_t i) { return Slice(columns[i].second); }); +} + +Status WideColumnSerialization::SerializeV2( + const WideColumns& columns, + const std::vector>& blob_columns, + std::string& output) { + return SerializeV2Impl( + columns.size(), blob_columns, output, + [&](size_t i) { return columns[i].name(); }, + [&](size_t i) { return columns[i].value(); }); +} + +Status WideColumnSerialization::DeserializeV1( + Slice& input, uint32_t num_columns, std::vector& columns) { columns.reserve(num_columns); autovector column_value_sizes; @@ -97,8 +257,11 @@ Status WideColumnSerialization::Deserialize(Slice& input, return Status::Corruption("Error decoding wide column name"); } - if (!columns.empty() && columns.back().name().compare(name) >= 0) { - return Status::Corruption("Wide columns out of order"); + if (!columns.empty()) { + if (Status so = ValidateColumnOrder(columns.back().name(), name); + !so.ok()) { + return so; + } } columns.emplace_back(name, Slice()); @@ -129,12 +292,324 @@ Status WideColumnSerialization::Deserialize(Slice& input, return Status::OK(); } +Status WideColumnSerialization::DeserializeV2Impl( + Slice& input, uint32_t num_columns, std::vector& columns, + std::vector& column_types) { + // Section 2: SKIP INFO (3 varints) + uint32_t name_sizes_bytes = 0; + uint32_t value_sizes_bytes = 0; + uint32_t names_bytes = 0; + if (!GetVarint32(&input, &name_sizes_bytes)) { + return Status::Corruption("Error decoding wide column name sizes bytes"); + } + if (!GetVarint32(&input, &value_sizes_bytes)) { + return Status::Corruption("Error decoding wide column value sizes bytes"); + } + if (!GetVarint32(&input, &names_bytes)) { + return Status::Corruption("Error decoding wide column names bytes"); + } + + // Section 3: COLUMN TYPES (N bytes, each is a ValueType) + if (input.size() < num_columns) { + return Status::Corruption("Error decoding wide column types"); + } + column_types.resize(num_columns); + for (uint32_t i = 0; i < num_columns; ++i) { + column_types[i] = static_cast(input[i]); + if (!IsValidColumnValueType(column_types[i])) { + return Status::Corruption("Unsupported wide column ValueType"); + } + } + input.remove_prefix(num_columns); + + // Validate that sections 4-6 fit in the remaining input + const size_t metadata_size = + name_sizes_bytes + value_sizes_bytes + names_bytes; + if (input.size() < metadata_size) { + return Status::Corruption("Error decoding wide column sections"); + } + + // Set up 4 pointers into sections 4-7 for single-loop parsing. + // Skip info gives us exact boundaries for each section. + const char* s4 = input.data(); // section 4: name sizes + const char* s4_limit = s4 + name_sizes_bytes; + const char* s5 = s4_limit; // section 5: value sizes + const char* s5_limit = s5 + value_sizes_bytes; + const char* s6 = s5_limit; // section 6: names + const char* s7 = s6 + names_bytes; // section 7: values + const char* input_end = input.data() + input.size(); + + columns.reserve(num_columns); + size_t name_pos = 0; + size_t value_pos = 0; + + for (uint32_t i = 0; i < num_columns; ++i) { + // Decode name size from section 4 + uint32_t ns = 0; + const char* s4_next = GetVarint32Ptr(s4, s4_limit, &ns); + if (s4_next == nullptr) { + return Status::Corruption("Error decoding wide column name size"); + } + s4 = s4_next; + + // Decode value size from section 5 + uint32_t vs = 0; + const char* s5_next = GetVarint32Ptr(s5, s5_limit, &vs); + if (s5_next == nullptr) { + return Status::Corruption("Error decoding wide column value size"); + } + s5 = s5_next; + + // Read name from section 6 + if (name_pos + ns > names_bytes) { + return Status::Corruption("Error decoding wide column name"); + } + Slice name(s6 + name_pos, ns); + + if (!columns.empty()) { + if (Status so = ValidateColumnOrder(columns.back().name(), name); + !so.ok()) { + return so; + } + } + + // Read value from section 7 + if (s7 + value_pos + vs > input_end) { + return Status::Corruption("Error decoding wide column value payload"); + } + + columns.emplace_back(name, Slice(s7 + value_pos, vs)); + name_pos += ns; + value_pos += vs; + } + + return Status::OK(); +} + +Status WideColumnSerialization::Deserialize(Slice& input, + WideColumns& columns) { + assert(columns.empty()); + + // Reuse DeserializeV2, then reject any blob references. + std::vector> blob_columns; + if (Status s = DeserializeV2(input, columns, blob_columns); !s.ok()) { + return s; + } + + if (!blob_columns.empty()) { + return Status::NotSupported( + "Wide column contains blob references. Use DeserializeV2."); + } + + return Status::OK(); +} + +Status WideColumnSerialization::DeserializeV2( + Slice& input, std::vector& columns, + std::vector>& blob_columns) { + assert(columns.empty()); + assert(blob_columns.empty()); + + uint32_t version = 0; + if (!GetVarint32(&input, &version)) { + return Status::Corruption("Error decoding wide column version"); + } + + if (version > kVersion2) { + return Status::NotSupported("Unsupported wide column version"); + } + + uint32_t num_columns = 0; + if (!GetVarint32(&input, &num_columns)) { + return Status::Corruption("Error decoding number of wide columns"); + } + + if (!num_columns) { + return Status::OK(); + } + + if (version >= kVersion2) { + // V2 layout: parse columns and extract blob column info + std::vector column_types; + + if (Status s = DeserializeV2Impl(input, num_columns, columns, column_types); + !s.ok()) { + return s; + } + assert(column_types.size() == num_columns); + assert(columns.size() == num_columns); + + // Decode blob indices from value data + for (uint32_t i = 0; i < num_columns; ++i) { + if (column_types[i] == kTypeBlobIndex) { + BlobIndex blob_idx; + Slice blob_slice = columns[i].value(); + if (Status bs = blob_idx.DecodeFrom(blob_slice); !bs.ok()) { + return Status::Corruption("Error decoding blob index in wide column"); + } + blob_columns.emplace_back(i, blob_idx); + } + } + } else { + return DeserializeV1(input, num_columns, columns); + } + + return Status::OK(); +} + +Status WideColumnSerialization::HasBlobColumns(const Slice& input, + bool& has_blob_columns) { + has_blob_columns = false; + + Slice input_ref = input; + + uint32_t version = 0; + if (!GetVarint32(&input_ref, &version)) { + return Status::Corruption("Error decoding wide column version"); + } + + // Version 1 never has blob columns + if (version < kVersion2) { + return Status::OK(); + } + + uint32_t num_columns = 0; + if (!GetVarint32(&input_ref, &num_columns)) { + return Status::Corruption("Error decoding number of wide columns"); + } + + if (!num_columns) { + return Status::OK(); + } + + // V2: Skip over SKIP INFO (3 varints) to reach COLUMN TYPES section. + uint32_t unused_name_sizes_bytes = 0; + uint32_t unused_value_sizes_bytes = 0; + uint32_t unused_names_bytes = 0; + if (!GetVarint32(&input_ref, &unused_name_sizes_bytes) || + !GetVarint32(&input_ref, &unused_value_sizes_bytes) || + !GetVarint32(&input_ref, &unused_names_bytes)) { + return Status::Corruption("Error decoding wide column skip info"); + } + if (input_ref.size() < num_columns) { + return Status::Corruption("Error decoding wide column types"); + } + has_blob_columns = ContainsBlobType(input_ref.data(), num_columns); + + return Status::OK(); +} + +Status WideColumnSerialization::GetVersion(const Slice& input, + uint32_t& version) { + Slice input_ref = input; + + version = 0; + if (!GetVarint32(&input_ref, &version)) { + return Status::Corruption("Error decoding wide column version"); + } + + return Status::OK(); +} + Status WideColumnSerialization::GetValueOfDefaultColumn(Slice& input, Slice& value) { + Slice input_ref = input; + + uint32_t version = 0; + if (!GetVarint32(&input_ref, &version)) { + return Status::Corruption("Error decoding wide column version"); + } + + if (version > kVersion2) { + return Status::NotSupported("Unsupported wide column version"); + } + + uint32_t num_columns = 0; + if (!GetVarint32(&input_ref, &num_columns)) { + return Status::Corruption("Error decoding number of wide columns"); + } + + if (!num_columns) { + value.clear(); + return Status::OK(); + } + + if (version >= kVersion2) { + // V2 fast path: use skip info to jump directly to values without + // scanning through variable-length sections. + + // Read SKIP INFO (3 varints, immediately after header) + uint32_t name_sizes_bytes = 0; + uint32_t value_sizes_bytes = 0; + uint32_t names_bytes = 0; + if (!GetVarint32(&input_ref, &name_sizes_bytes)) { + return Status::Corruption("Error decoding wide column name sizes bytes"); + } + if (!GetVarint32(&input_ref, &value_sizes_bytes)) { + return Status::Corruption("Error decoding wide column value sizes bytes"); + } + if (!GetVarint32(&input_ref, &names_bytes)) { + return Status::Corruption("Error decoding wide column names bytes"); + } + + // Read COLUMN TYPES (N bytes) + if (input_ref.size() < num_columns) { + return Status::Corruption("Error decoding wide column types"); + } + // Check if default column (index 0) is a blob reference + if (static_cast(input_ref[0]) == kTypeBlobIndex) { + return Status::NotSupported( + "Wide column contains blob references. Use DeserializeV2."); + } + input_ref.remove_prefix(num_columns); + + // Peek first name size from NAME SIZES section + if (input_ref.size() < name_sizes_bytes) { + return Status::Corruption("Error decoding wide column name sizes"); + } + Slice name_sizes_section(input_ref.data(), name_sizes_bytes); + uint32_t first_name_size = 0; + if (!GetVarint32(&name_sizes_section, &first_name_size)) { + return Status::Corruption("Error decoding wide column name size"); + } + input_ref.remove_prefix(name_sizes_bytes); + + // Peek first value size from VALUE SIZES section + if (input_ref.size() < value_sizes_bytes) { + return Status::Corruption("Error decoding wide column value sizes"); + } + Slice value_sizes_section(input_ref.data(), value_sizes_bytes); + uint32_t first_value_size = 0; + if (!GetVarint32(&value_sizes_section, &first_value_size)) { + return Status::Corruption("Error decoding wide column value size"); + } + // Skip entire VALUE SIZES section using value_sizes_bytes + input_ref.remove_prefix(value_sizes_bytes); + + // Check if the first column is the default column (empty name) + if (first_name_size != 0) { + value.clear(); + return Status::OK(); + } + + // Skip NAMES section + if (input_ref.size() < names_bytes) { + return Status::Corruption("Error decoding wide column names"); + } + input_ref.remove_prefix(names_bytes); + + // Read the first value from VALUES section + if (input_ref.size() < first_value_size) { + return Status::Corruption("Error decoding wide column value payload"); + } + value = Slice(input_ref.data(), first_value_size); + return Status::OK(); + } + + // V1 fallback: full deserialization WideColumns columns; - const Status s = Deserialize(input, columns); - if (!s.ok()) { + if (Status s = Deserialize(input, columns); !s.ok()) { return s; } @@ -148,4 +623,145 @@ Status WideColumnSerialization::GetValueOfDefaultColumn(Slice& input, return Status::OK(); } +Status WideColumnSerialization::ResolveEntityBlobColumns( + const Slice& entity_value, const Slice& user_key, + const BlobFetcher* blob_fetcher, PrefetchBufferCollection* prefetch_buffers, + std::string& resolved_entity, bool& resolved, uint64_t* total_bytes_read, + uint64_t* num_blobs_resolved) { + assert(blob_fetcher); + + resolved = false; + + std::vector columns; + std::vector> blob_columns; + + Slice input_copy = entity_value; + if (Status s = DeserializeV2(input_copy, columns, blob_columns); !s.ok()) { + return s; + } + + if (blob_columns.empty()) { + return Status::OK(); + } + + resolved = true; + + // Fetch each blob value + std::vector resolved_blob_values; + resolved_blob_values.reserve(blob_columns.size()); + + for (const auto& blob_col : blob_columns) { + const BlobIndex& blob_idx = blob_col.second; + + if (blob_idx.IsInlined()) { + resolved_blob_values.emplace_back(blob_idx.value().data(), + blob_idx.value().size()); + continue; + } + + FilePrefetchBuffer* prefetch_buffer = + prefetch_buffers ? prefetch_buffers->GetOrCreatePrefetchBuffer( + blob_idx.file_number()) + : nullptr; + + uint64_t bytes_read = 0; + + PinnableSlice blob_value; + const Status fetch_s = blob_fetcher->FetchBlob( + user_key, blob_idx, prefetch_buffer, &blob_value, &bytes_read); + if (!fetch_s.ok()) { + return fetch_s; + } + + resolved_blob_values.emplace_back(blob_value.data(), blob_value.size()); + + if (total_bytes_read) { + *total_bytes_read += bytes_read; + } + } + + if (num_blobs_resolved) { + *num_blobs_resolved += blob_columns.size(); + } + + return SerializeResolvedEntity(columns, blob_columns, resolved_blob_values, + resolved_entity); +} + +Status WideColumnSerialization::GetValueOfDefaultColumnResolvingBlobs( + const Slice& entity_value, const Slice& user_key, + const BlobFetcher* blob_fetcher, PinnableSlice& result, bool& resolved) { + assert(blob_fetcher); + + resolved = false; + + std::vector columns; + std::vector> blob_columns; + + Slice input_copy = entity_value; + if (Status s = DeserializeV2(input_copy, columns, blob_columns); !s.ok()) { + return s; + } + + // The default column (empty name) is always at index 0 when present + // (columns are sorted by name). + if (columns.empty() || columns[0].name() != kDefaultWideColumnName) { + result.PinSelf(Slice()); + return Status::OK(); + } + + // Check if the default column (index 0) is a blob reference + for (const auto& blob_col : blob_columns) { + if (blob_col.first == 0) { + const BlobIndex& blob_idx = blob_col.second; + + resolved = true; + + if (blob_idx.IsInlined()) { + result.PinSelf(blob_idx.value()); + return Status::OK(); + } + + return blob_fetcher->FetchBlob(user_key, blob_idx, + nullptr /* prefetch_buffer */, &result, + nullptr /* bytes_read */); + } + } + + // Default column is inline + result.PinSelf(columns[0].value()); + return Status::OK(); +} + +Status WideColumnSerialization::SerializeResolvedEntity( + const std::vector& columns, + const std::vector>& blob_columns, + const std::vector& resolved_blob_values, std::string& output) { + assert(blob_columns.size() == resolved_blob_values.size()); + + // blob_columns is sorted by column index and typically small, so use a + // linear scan with a cursor instead of an unordered_map. + size_t blob_cursor = 0; + + // Build result columns with resolved blob values + WideColumns result_columns; + result_columns.reserve(columns.size()); + + for (size_t i = 0; i < columns.size(); ++i) { + if (blob_cursor < blob_columns.size() && + blob_columns[blob_cursor].first == i) { + // This is a blob column - use the resolved value + result_columns.emplace_back(columns[i].name(), + Slice(resolved_blob_values[blob_cursor])); + ++blob_cursor; + } else { + // This is an inline column - use the original value + result_columns.emplace_back(columns[i].name(), columns[i].value()); + } + } + + // Serialize using V1 format (all values inline) + return Serialize(result_columns, output); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/wide/wide_column_serialization.h b/db/wide/wide_column_serialization.h index 4a97f6a78690..0a819907ae7e 100644 --- a/db/wide/wide_column_serialization.h +++ b/db/wide/wide_column_serialization.h @@ -6,18 +6,28 @@ #pragma once #include +#include #include +#include +#include +#include "db/dbformat.h" #include "rocksdb/rocksdb_namespace.h" #include "rocksdb/status.h" #include "rocksdb/wide_columns.h" namespace ROCKSDB_NAMESPACE { +class BlobFetcher; +class BlobIndex; +class FilePrefetchBuffer; +class PinnableSlice; +class PrefetchBufferCollection; class Slice; // Wide-column serialization/deserialization primitives. // +// Version 1 Layout: // The two main parts of the layout are 1) a sorted index containing the column // names and column value sizes and 2) the column values themselves. Keeping the // index and the values separate will enable selectively reading column values @@ -40,16 +50,224 @@ class Slice; // ...---+----------+-------+----------+-------+---...---+-------+ // | varint32 | bytes | varint32 | bytes | | bytes | // ...---+----------+-------+----------+-------+---...---+-------+ +// +// Version 2 Layout (with blob index support): +// Groups all metadata upfront before variable-length data. This enables +// efficient access patterns: index-based value access skips name data +// entirely, default column access is O(1), and type checks are O(1). +// +// Legend: cn = column name, cv = column value, cns = column name size, +// cvs = column value size, ct = column type. +// +// Section 1: HEADER (2 varints) +// +----------+--------------+ +// | version | # of columns | +// | varint32 | varint32 | +// +----------+--------------+ +// +// Section 2: SKIP INFO (3 varints) +// +-------------------+---------------------+------------------+ +// | name_sizes_bytes | value_sizes_bytes | names_bytes | +// | varint32 | varint32 | varint32 | +// +-------------------+---------------------+------------------+ +// name_sizes_bytes = byte size of NAME SIZES section (section 4) +// value_sizes_bytes = byte size of VALUE SIZES section (section 5) +// names_bytes = byte size of NAMES section (section 6) +// +// Placed immediately after the header so that header + skip info form +// a contiguous varint sequence (5 varints), enabling future SIMD-based +// varint decoding. +// +// Section 3: COLUMN TYPES (N bytes, fixed-size) +// +------+------+---...---+--------+ +// | ct_0 | ct_1 | | ct_N-1 | +// | byte | byte | | byte | +// +------+------+---...---+--------+ +// ct values are ValueType entries from db/dbformat.h, e.g.: +// kTypeValue (0x01) = inline value +// kTypeBlobIndex (0x11) = blob index reference +// Future per-column types (kTypeMerge, kTypeDeletion, etc.) can be +// added without format changes. +// +// Section 4: NAME SIZES (N varints) +// +----------+----------+---...---+------------+ +// | cns_0 | cns_1 | | cns_{N-1} | +// | varint32 | varint32 | | varint32 | +// +----------+----------+---...---+------------+ +// +// Section 5: VALUE SIZES (N varints) +// +----------+----------+---...---+------------+ +// | cvs_0 | cvs_1 | | cvs_{N-1} | +// | varint32 | varint32 | | varint32 | +// +----------+----------+---...---+------------+ +// +// Section 6: COLUMN NAMES (concatenated, sorted) +// +------+------+---...---+--------+ +// | cn_0 | cn_1 | | cn_N-1 | +// | bytes| bytes| | bytes | +// +------+------+---...---+--------+ +// +// Section 7: COLUMN VALUES (concatenated) +// +------+------+---...---+--------+ +// | cv_0 | cv_1 | | cv_N-1 | +// | bytes| bytes| | bytes | +// +------+------+---...---+--------+ +// +// When ct = kTypeBlobIndex, the cv contains a serialized BlobIndex. class WideColumnSerialization { public: + // Version constants for wide column serialization format. + // - kVersion1: Original format with inline column values only. + // - kVersion2: Extended format that supports blob index references in + // columns. Used when large column values are stored in blob + // files. + static constexpr uint32_t kVersion1 = 1; + static constexpr uint32_t kVersion2 = 2; + + // Serialize columns using version 1 format (no blob support) static Status Serialize(const WideColumns& columns, std::string& output); + // Serialize columns with some columns replaced by blob indices (version 2) + // columns: vector of (column_name, column_value) pairs + // blob_columns: vector of (column_index, blob_index) pairs indicating which + // columns should be stored as blob references + static Status SerializeV2( + const std::vector>& columns, + const std::vector>& blob_columns, + std::string& output); + + // Overload that takes Slice-based WideColumns directly, avoiding the + // need to copy column names and values into string pairs. + static Status SerializeV2( + const WideColumns& columns, + const std::vector>& blob_columns, + std::string& output); + + // Deserialize columns (version 1 format only) static Status Deserialize(Slice& input, WideColumns& columns); + // Deserialize columns and separate inline columns from blob columns + // columns: receives inline column values + // blob_columns: receives (column_index, blob_index) pairs for blob references + static Status DeserializeV2( + Slice& input, std::vector& columns, + std::vector>& blob_columns); + + // Check if the serialized entity has any blob column references. + // Sets *has_blob_columns to true if version >= 2 and at least one column + // has blob type; false otherwise. + // Returns Status::Corruption on decode errors. + static Status HasBlobColumns(const Slice& input, bool& has_blob_columns); + static Status GetValueOfDefaultColumn(Slice& input, Slice& value); - static constexpr uint32_t kCurrentVersion = 1; + // Resolves all blob column references in a V2 wide-column entity, + // fetches the blob values, and re-serializes as a V1 entity (all inline). + // Handles inlined blobs (IsInlined) defensively. + // + // Used by the read path (GetContext, DBIter) when a V2 entity with blob + // column references needs to be converted to V1 format for consumption by + // APIs that only support V1 (e.g., TimedFullMerge, + // PinnableWideColumns::SetWideColumnValue). + // + // Sets *resolved to false and leaves resolved_entity unchanged when + // no blob columns are present. + // + // Optional parameters: + // prefetch_buffers - for prefetch optimization (nullptr = no prefetch) + // total_bytes_read - accumulates bytes read from blob files (nullptr = + // skip) num_blobs_resolved - count of blob columns resolved (nullptr = + // skip) + static Status ResolveEntityBlobColumns( + const Slice& entity_value, const Slice& user_key, + const BlobFetcher* blob_fetcher, + PrefetchBufferCollection* prefetch_buffers, std::string& resolved_entity, + bool& resolved, uint64_t* total_bytes_read, uint64_t* num_blobs_resolved); + + // Extracts the default column value from a V2 entity, resolving its + // blob reference if needed. The default column (empty name) is always + // at index 0 when present (columns are sorted). + // + // Sets result to the resolved default column value (fetching from blob + // file if it's a blob reference). If there is no default column, result + // is set to empty. Sets *resolved to true if a blob was found for the + // default column, false otherwise. + static Status GetValueOfDefaultColumnResolvingBlobs( + const Slice& entity_value, const Slice& user_key, + const BlobFetcher* blob_fetcher, PinnableSlice& result, bool& resolved); + + private: + friend class WideColumnSerializationTest; + // Get the serialization version from the input. + // Sets *version to the version number. + // Returns Status::Corruption on decode errors. + static Status GetVersion(const Slice& input, uint32_t& version); + + // Merges deserialized columns with resolved blob values and serializes + // the result using version 1 format (all values inline). + static Status SerializeResolvedEntity( + const std::vector& columns, + const std::vector>& blob_columns, + const std::vector& resolved_blob_values, + std::string& output); + + // Returns InvalidArgument with the given message if size exceeds uint32_t. + static Status ValidateWideColumnLimit(size_t size, const char* msg) { + if (size > static_cast(std::numeric_limits::max())) { + return Status::InvalidArgument(msg); + } + return Status::OK(); + } + + // Returns Corruption if prev_name >= name (columns must be strictly ordered). + static Status ValidateColumnOrder(const Slice& prev_name, const Slice& name) { + if (prev_name.compare(name) >= 0) { + return Status::Corruption("Wide columns out of order"); + } + return Status::OK(); + } + + // Shared implementation for both SerializeV2 overloads. + // get_name(i): returns Slice for column i's name + // get_value(i): returns Slice for column i's inline value + template + static Status SerializeV2Impl( + size_t num_columns, + const std::vector>& blob_columns, + std::string& output, GetName get_name, GetValue get_value); + + // Validates num_columns limit and builds a per-column lookup map from + // blob_columns. Returns InvalidArgument on validation failure. + static Status BuildBlobIndexMap( + size_t num_columns, + const std::vector>& blob_columns, + std::vector& blob_index_map); + + // Parses V1 layout (interleaved name/value_size pairs followed by values) + // into columns. Used by both Deserialize and DeserializeV2 to avoid + // code duplication. + static Status DeserializeV1(Slice& input, uint32_t num_columns, + std::vector& columns); + + // Parses V2 layout sections 2-7 (skip info through values) into columns and + // column types. Used by both Deserialize and DeserializeV2 to avoid + // code duplication. + static Status DeserializeV2Impl(Slice& input, uint32_t num_columns, + std::vector& columns, + std::vector& column_types); + + // Returns true if t is a supported per-column ValueType. Currently only + // kTypeValue (inline) and kTypeBlobIndex are supported. Notably, + // kTypeWideColumnEntity is rejected to prevent recursive nesting. + static bool IsValidColumnValueType(ValueType t) { + return t == kTypeValue || t == kTypeBlobIndex; + } + + // Returns true if any of the first num_columns type bytes equals + // kTypeBlobIndex. Typical entities have <10 columns, so a linear + // scan is sufficient; SIMD could be considered if column counts grow. + static bool ContainsBlobType(const char* type_bytes, uint32_t num_columns); }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/wide/wide_column_serialization_test.cc b/db/wide/wide_column_serialization_test.cc index 83a849da9eb3..018324d855e8 100644 --- a/db/wide/wide_column_serialization_test.cc +++ b/db/wide/wide_column_serialization_test.cc @@ -5,13 +5,35 @@ #include "db/wide/wide_column_serialization.h" +#include +#include + +#include "db/blob/blob_index.h" #include "db/wide/wide_columns_helper.h" #include "test_util/testharness.h" #include "util/coding.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { -TEST(WideColumnSerializationTest, Construct) { +class WideColumnSerializationTest : public testing::Test { + protected: + // Wrappers for private methods accessible via friend declaration. + static Status GetVersion(const Slice& input, uint32_t& version) { + return WideColumnSerialization::GetVersion(input, version); + } + + static Status SerializeResolvedEntity( + const std::vector& columns, + const std::vector>& blob_columns, + const std::vector& resolved_blob_values, + std::string& output) { + return WideColumnSerialization::SerializeResolvedEntity( + columns, blob_columns, resolved_blob_values, output); + } +}; + +TEST_F(WideColumnSerializationTest, Construct) { constexpr char foo[] = "foo"; constexpr char bar[] = "bar"; @@ -87,7 +109,7 @@ TEST(WideColumnSerializationTest, Construct) { } } -TEST(WideColumnSerializationTest, SerializeDeserialize) { +TEST_F(WideColumnSerializationTest, SerializeDeserialize) { WideColumns columns{{"foo", "bar"}, {"hello", "world"}}; std::string output; @@ -126,7 +148,7 @@ TEST(WideColumnSerializationTest, SerializeDeserialize) { } } -TEST(WideColumnSerializationTest, SerializeDuplicateError) { +TEST_F(WideColumnSerializationTest, SerializeDuplicateError) { WideColumns columns{{"foo", "bar"}, {"foo", "baz"}}; std::string output; @@ -134,7 +156,7 @@ TEST(WideColumnSerializationTest, SerializeDuplicateError) { WideColumnSerialization::Serialize(columns, output).IsCorruption()); } -TEST(WideColumnSerializationTest, SerializeOutOfOrderError) { +TEST_F(WideColumnSerializationTest, SerializeOutOfOrderError) { WideColumns columns{{"hello", "world"}, {"foo", "bar"}}; std::string output; @@ -142,7 +164,7 @@ TEST(WideColumnSerializationTest, SerializeOutOfOrderError) { WideColumnSerialization::Serialize(columns, output).IsCorruption()); } -TEST(WideColumnSerializationTest, DeserializeVersionError) { +TEST_F(WideColumnSerializationTest, DeserializeVersionError) { // Can't decode version std::string buf; @@ -155,7 +177,7 @@ TEST(WideColumnSerializationTest, DeserializeVersionError) { ASSERT_TRUE(std::strstr(s.getState(), "version")); } -TEST(WideColumnSerializationTest, DeserializeUnsupportedVersion) { +TEST_F(WideColumnSerializationTest, DeserializeUnsupportedVersion) { // Unsupported version constexpr uint32_t future_version = 1000; @@ -170,11 +192,11 @@ TEST(WideColumnSerializationTest, DeserializeUnsupportedVersion) { ASSERT_TRUE(std::strstr(s.getState(), "version")); } -TEST(WideColumnSerializationTest, DeserializeNumberOfColumnsError) { +TEST_F(WideColumnSerializationTest, DeserializeNumberOfColumnsError) { // Can't decode number of columns std::string buf; - PutVarint32(&buf, WideColumnSerialization::kCurrentVersion); + PutVarint32(&buf, WideColumnSerialization::kVersion1); Slice input(buf); WideColumns columns; @@ -184,10 +206,10 @@ TEST(WideColumnSerializationTest, DeserializeNumberOfColumnsError) { ASSERT_TRUE(std::strstr(s.getState(), "number")); } -TEST(WideColumnSerializationTest, DeserializeColumnsError) { +TEST_F(WideColumnSerializationTest, DeserializeV2Error) { std::string buf; - PutVarint32(&buf, WideColumnSerialization::kCurrentVersion); + PutVarint32(&buf, WideColumnSerialization::kVersion1); constexpr uint32_t num_columns = 2; PutVarint32(&buf, num_columns); @@ -277,10 +299,10 @@ TEST(WideColumnSerializationTest, DeserializeColumnsError) { } } -TEST(WideColumnSerializationTest, DeserializeColumnsOutOfOrder) { +TEST_F(WideColumnSerializationTest, DeserializeV2OutOfOrder) { std::string buf; - PutVarint32(&buf, WideColumnSerialization::kCurrentVersion); + PutVarint32(&buf, WideColumnSerialization::kVersion1); constexpr uint32_t num_columns = 2; PutVarint32(&buf, num_columns); @@ -302,6 +324,521 @@ TEST(WideColumnSerializationTest, DeserializeColumnsOutOfOrder) { ASSERT_TRUE(std::strstr(s.getState(), "order")); } +TEST_F(WideColumnSerializationTest, DeserializeV2RejectsRecursiveType) { + // Manually construct a V2 entity where one column has type + // kTypeWideColumnEntity, which would create recursive nesting. + // Deserialization must reject this. + std::string buf; + + PutVarint32(&buf, WideColumnSerialization::kVersion2); + + constexpr uint32_t num_columns = 2; + PutVarint32(&buf, num_columns); + + // Section 2: COLUMN TYPES -- first column inline, second recursive + buf.push_back(static_cast(kTypeValue)); + buf.push_back(static_cast(kTypeWideColumnEntity)); + + // Section 3: SKIP INFO + PutVarint32(&buf, 2); // name_sizes_bytes (varint(1) + varint(1)) + PutVarint32(&buf, 2); // value_sizes_bytes (varint(3) + varint(5)) + PutVarint32(&buf, 2); // names_bytes ("a" + "b") + + // Section 4: NAME SIZES + PutVarint32(&buf, 1); // "a" + PutVarint32(&buf, 1); // "b" + + // Section 5: VALUE SIZES + PutVarint32(&buf, 3); + PutVarint32(&buf, 5); + + // Section 6: NAMES + buf.append("ab"); + + // Section 7: VALUES (8 bytes of placeholder data) + buf.append(8, 'x'); + + // DeserializeV2 should reject with Corruption + { + Slice input(buf); + std::vector columns; + std::vector> blob_columns; + const Status s = + WideColumnSerialization::DeserializeV2(input, columns, blob_columns); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Unsupported wide column ValueType")); + } + + // Deserialize (V1-only API) should also reject + { + Slice input(buf); + WideColumns columns; + const Status s = WideColumnSerialization::Deserialize(input, columns); + ASSERT_TRUE(s.IsCorruption()); + } +} + +// Helper: create a BlobIndex from EncodeBlob parameters. +static BlobIndex MakeBlobIndex(uint64_t file_number, uint64_t offset, + uint64_t size, + CompressionType compression = kNoCompression) { + std::string encoded; + BlobIndex::EncodeBlob(&encoded, file_number, offset, size, compression); + BlobIndex bi; + Slice s(encoded); + assert(bi.DecodeFrom(s).ok()); + return bi; +} + +// Helper: V2 serialize → DeserializeV2 round-trip, returning +// deserialized columns and blob column info. +static void V2SerializeAndDeserialize( + const std::vector>& columns, + const std::vector>& blob_columns_in, + std::vector* deserialized, + std::vector>* blob_columns_out, + std::string* serialized_out) { + ASSERT_OK(WideColumnSerialization::SerializeV2(columns, blob_columns_in, + *serialized_out)); + + Slice input(*serialized_out); + ASSERT_OK(WideColumnSerialization::DeserializeV2(input, *deserialized, + *blob_columns_out)); + ASSERT_EQ(deserialized->size(), columns.size()); + for (size_t i = 0; i < columns.size(); ++i) { + ASSERT_EQ((*deserialized)[i].name(), columns[i].first); + } +} + +// Helper: build WideColumns from string pairs. +static WideColumns ToWideColumns( + const std::vector>& columns) { + WideColumns wc; + wc.reserve(columns.size()); + for (const auto& col : columns) { + wc.emplace_back(Slice(col.first), Slice(col.second)); + } + return wc; +} + +// Helper: deserialize and verify column names match expected.first +// and column values match expected_values[i]. +static void VerifyDeserialize( + const std::string& serialized, + const std::vector>& expected, + const std::vector& expected_values) { + Slice input(serialized); + WideColumns deserialized; + ASSERT_OK(WideColumnSerialization::Deserialize(input, deserialized)); + ASSERT_EQ(deserialized.size(), expected.size()); + for (size_t i = 0; i < expected.size(); ++i) { + ASSERT_EQ(deserialized[i].name(), expected[i].first); + ASSERT_EQ(deserialized[i].value(), expected_values[i]); + } +} + +// Convenience overload: values come from expected[i].second. +static void VerifyDeserialize( + const std::string& serialized, + const std::vector>& expected) { + std::vector values; + values.reserve(expected.size()); + for (const auto& col : expected) { + values.push_back(col.second); + } + VerifyDeserialize(serialized, expected, values); +} + +// Helper: create a random non-inlined BlobIndex using the given RNG. +// Only creates Blob or BlobTTL types (not InlinedTTL), because InlinedTTL +// stores a Slice pointing into the encoded string, which would become a +// dangling reference after this function returns. +static BlobIndex MakeRandomBlobIndex(Random& rng) { + std::string bi_str; + if (rng.Uniform(2) == 0) { + BlobIndex::EncodeBlob(&bi_str, rng.Uniform(1000), rng.Uniform(10000), + rng.Uniform(5000), kNoCompression); + } else { + BlobIndex::EncodeBlobTTL(&bi_str, rng.Uniform(1000000), rng.Uniform(1000), + rng.Uniform(10000), rng.Uniform(5000), + kSnappyCompression); + } + BlobIndex bi; + Slice s(bi_str); + assert(bi.DecodeFrom(s).ok()); + return bi; +} + +// Helper: V2 serialize with no blobs then GetValueOfDefaultColumn. +static void VerifyGetDefaultColumn( + const std::vector>& columns, + const Slice& expected_value) { + std::vector> no_blobs; + std::string serialized; + ASSERT_OK( + WideColumnSerialization::SerializeV2(columns, no_blobs, serialized)); + + Slice input(serialized); + Slice value; + ASSERT_OK(WideColumnSerialization::GetValueOfDefaultColumn(input, value)); + ASSERT_EQ(value, expected_value); +} + +TEST_F(WideColumnSerializationTest, SerializeResolvedEntity) { + // Test resolve with mixed, all-blob, and no-blob configurations + struct TestCase { + std::vector> columns; + std::vector> blob_cols; + std::vector resolved_values; + std::vector expected_values; + }; + + std::vector cases = { + // Mixed inline and blob + {.columns = {{"a", "inline_a"}, {"b", "ph"}, {"c", "inline_c"}}, + .blob_cols = {{1, MakeBlobIndex(50, 500, 100)}}, + .resolved_values = {"resolved_b"}, + .expected_values = {"inline_a", "resolved_b", "inline_c"}}, + // All blob columns + {.columns = {{"x", "ph1"}, {"y", "ph2"}, {"z", "ph3"}}, + .blob_cols = {{0, MakeBlobIndex(10, 100, 50)}, + {1, MakeBlobIndex(20, 200, 60)}, + {2, MakeBlobIndex(30, 300, 70)}}, + .resolved_values = {"val_x", "val_y", "val_z"}, + .expected_values = {"val_x", "val_y", "val_z"}}, + // No blob columns + {.columns = {{"alpha", "val_alpha"}, {"beta", "val_beta"}}, + .blob_cols = {}, + .resolved_values = {}, + .expected_values = {"val_alpha", "val_beta"}}, + }; + + for (const auto& tc : cases) { + std::string serialized; + std::vector deserialized; + std::vector> blob_out; + V2SerializeAndDeserialize(tc.columns, tc.blob_cols, &deserialized, + &blob_out, &serialized); + + std::string resolved_output; + ASSERT_OK(WideColumnSerializationTest::SerializeResolvedEntity( + deserialized, blob_out, tc.resolved_values, resolved_output)); + + uint32_t v = 0; + ASSERT_OK(GetVersion(Slice(resolved_output), v)); + ASSERT_EQ(v, WideColumnSerialization::kVersion1); + + VerifyDeserialize(resolved_output, tc.columns, tc.expected_values); + } +} + +TEST_F(WideColumnSerializationTest, V2GetValueOfDefaultColumn) { + // V2 with default column present + VerifyGetDefaultColumn({{"", "default_value"}, {"col1", "value1"}}, + "default_value"); + // V2 without default column + VerifyGetDefaultColumn({{"col1", "value1"}, {"col2", "value2"}}, Slice()); + // V2 with zero columns + VerifyGetDefaultColumn({}, Slice()); + + // V1 fallback + { + WideColumns columns{{"", "v1_default"}, {"col1", "v1"}}; + std::string serialized; + ASSERT_OK(WideColumnSerialization::Serialize(columns, serialized)); + + Slice input(serialized); + Slice value; + ASSERT_OK(WideColumnSerialization::GetValueOfDefaultColumn(input, value)); + ASSERT_EQ(value, "v1_default"); + } +} + +TEST_F(WideColumnSerializationTest, V2BlobColumnRejectsDeserialize) { + std::vector> columns = { + {"a", "inline"}, {"b", "placeholder"}}; + std::vector> blob_columns = { + {1, MakeBlobIndex(1, 2, 3)}}; + + std::string serialized; + ASSERT_OK( + WideColumnSerialization::SerializeV2(columns, blob_columns, serialized)); + + Slice input(serialized); + WideColumns deserialized; + ASSERT_TRUE(WideColumnSerialization::Deserialize(input, deserialized) + .IsNotSupported()); +} + +TEST_F(WideColumnSerializationTest, V2GetValueOfDefaultColumnBlobRef) { + // When default column (index 0) is a blob reference, + // GetValueOfDefaultColumn should return NotSupported. + std::vector> columns = { + {"", "placeholder"}, {"col1", "value1"}}; + std::vector> blob_columns = { + {0, MakeBlobIndex(10, 100, 500)}}; + + std::string serialized; + ASSERT_OK( + WideColumnSerialization::SerializeV2(columns, blob_columns, serialized)); + + Slice input(serialized); + Slice value; + ASSERT_TRUE(WideColumnSerialization::GetValueOfDefaultColumn(input, value) + .IsNotSupported()); +} + +TEST_F(WideColumnSerializationTest, SerializeV2Errors) { + // Blob column index out of range + { + std::vector> columns = {{"a", "val"}}; + std::vector> blob_columns = { + {5, MakeBlobIndex(1, 2, 3)}}; // index 5 but only 1 column + + std::string output; + ASSERT_TRUE( + WideColumnSerialization::SerializeV2(columns, blob_columns, output) + .IsInvalidArgument()); + } + + // Columns out of order (V2) + { + std::vector> columns = {{"b", "val_b"}, + {"a", "val_a"}}; + std::vector> no_blobs; + + std::string output; + ASSERT_TRUE(WideColumnSerialization::SerializeV2(columns, no_blobs, output) + .IsCorruption()); + } + + // Duplicate column names (V2) + { + std::vector> columns = {{"a", "val1"}, + {"a", "val2"}}; + std::vector> no_blobs; + + std::string output; + ASSERT_TRUE(WideColumnSerialization::SerializeV2(columns, no_blobs, output) + .IsCorruption()); + } +} + +TEST_F(WideColumnSerializationTest, BlobIndexEncodeToRoundTrip) { + // Test EncodeTo produces identical output to static Encode methods + // for all three blob index types. + auto verify_encode_to = [](const std::string& encoded_static) { + BlobIndex bi; + Slice s(encoded_static); + ASSERT_OK(bi.DecodeFrom(s)); + std::string encoded_instance; + bi.EncodeTo(&encoded_instance); + ASSERT_EQ(encoded_static, encoded_instance); + }; + + std::string blob_str; + std::string blob_ttl_str; + std::string inlined_str; + BlobIndex::EncodeBlob(&blob_str, 42, 1024, 2048, kSnappyCompression); + BlobIndex::EncodeBlobTTL(&blob_ttl_str, 9999, 10, 200, 3000, + kZlibCompression); + BlobIndex::EncodeInlinedTTL(&inlined_str, 12345, "inline_data"); + + verify_encode_to(blob_str); + verify_encode_to(blob_ttl_str); + verify_encode_to(inlined_str); +} + +TEST_F(WideColumnSerializationTest, V2LayoutStructureVerification) { + // Verify the V2 binary layout structure by manually parsing sections + std::vector> columns = { + {"aa", "val_aa"}, {"bbb", "val_bbb"}}; + std::vector> empty_blob_columns; + + std::string serialized; + ASSERT_OK(WideColumnSerialization::SerializeV2(columns, empty_blob_columns, + serialized)); + + Slice data(serialized); + + // Section 1: HEADER + uint32_t version = 0; + ASSERT_TRUE(GetVarint32(&data, &version)); + ASSERT_EQ(version, WideColumnSerialization::kVersion2); + + uint32_t num_columns = 0; + ASSERT_TRUE(GetVarint32(&data, &num_columns)); + ASSERT_EQ(num_columns, 2u); + + // Section 2: SKIP INFO (3 varints) + uint32_t name_sizes_bytes = 0; + uint32_t value_sizes_bytes = 0; + uint32_t names_bytes = 0; + ASSERT_TRUE(GetVarint32(&data, &name_sizes_bytes)); + ASSERT_TRUE(GetVarint32(&data, &value_sizes_bytes)); + ASSERT_TRUE(GetVarint32(&data, &names_bytes)); + // name sizes: varint(2) + varint(3) = 1 + 1 = 2 bytes + ASSERT_EQ(name_sizes_bytes, 2u); + // value sizes: varint(6) + varint(7) = 1 + 1 = 2 bytes + ASSERT_EQ(value_sizes_bytes, 2u); + // names: "aa" + "bbb" = 2 + 3 = 5 bytes + ASSERT_EQ(names_bytes, 5u); + + // Section 3: COLUMN TYPES (2 bytes, both inline) + ASSERT_GE(data.size(), 2u); + ASSERT_EQ(static_cast(data[0]), static_cast(kTypeValue)); + ASSERT_EQ(static_cast(data[1]), static_cast(kTypeValue)); + data.remove_prefix(2); + + // Section 4: NAME SIZES + uint32_t ns0 = 0; + uint32_t ns1 = 0; + ASSERT_TRUE(GetVarint32(&data, &ns0)); + ASSERT_TRUE(GetVarint32(&data, &ns1)); + ASSERT_EQ(ns0, 2u); + ASSERT_EQ(ns1, 3u); + + // Section 5: VALUE SIZES + uint32_t vs0 = 0; + uint32_t vs1 = 0; + ASSERT_TRUE(GetVarint32(&data, &vs0)); + ASSERT_TRUE(GetVarint32(&data, &vs1)); + ASSERT_EQ(vs0, 6u); // "val_aa" = 6 + ASSERT_EQ(vs1, 7u); // "val_bbb" = 7 + + // Section 6: COLUMN NAMES + ASSERT_GE(data.size(), 5u); + ASSERT_EQ(Slice(data.data(), 2), "aa"); + ASSERT_EQ(Slice(data.data() + 2, 3), "bbb"); + data.remove_prefix(5); + + // Section 7: COLUMN VALUES + ASSERT_GE(data.size(), 13u); + ASSERT_EQ(Slice(data.data(), 6), "val_aa"); + ASSERT_EQ(Slice(data.data() + 6, 7), "val_bbb"); +} + +// Randomized correctness test: serialize and deserialize with random column +// counts, name sizes, value sizes, and randomly chosen blob columns. +// Validates the full round-trip for both V1 (Serialize) and V2 +// (SerializeV2) formats. +TEST_F(WideColumnSerializationTest, RandomizedSerializeDeserializeRoundTrip) { + uint32_t seed = static_cast( + std::chrono::system_clock::now().time_since_epoch().count()); + Random rng(seed); + SCOPED_TRACE("seed=" + std::to_string(seed)); + + constexpr int kNumIterations = 100; + + for (int iter = 0; iter < kNumIterations; ++iter) { + int num_cols = rng.Uniform(17); // 0..16 + int name_sz = 1 + rng.Uniform(64); // 1..64 + int val_sz = rng.Uniform(1025); // 0..1024 + + // Generate sorted column names and random values + std::vector> columns; + columns.reserve(num_cols); + for (int c = 0; c < num_cols; ++c) { + // Build a sorted, unique name of exactly name_sz bytes. + // Use hex-encoded index as prefix to guarantee sort order, + // then pad with random characters. + char idx_str[16]; + snprintf(idx_str, sizeof(idx_str), "%04x", c); + std::string name(idx_str); + if (static_cast(name.size()) < name_sz) { + name.append(name_sz - name.size(), + static_cast('a' + rng.Uniform(26))); + } + // Ensure exactly name_sz bytes. For name_sz < 4, use just the + // low-order hex digits to maintain sort order. + if (static_cast(name.size()) > name_sz) { + name = name.substr(name.size() - name_sz); + } + + // Random value content + std::string value(val_sz, '\0'); + for (int j = 0; j < val_sz; ++j) { + value[j] = static_cast(rng.Uniform(256)); + } + columns.emplace_back(std::move(name), std::move(value)); + } + + // Randomly select some columns as blob columns + std::vector> blob_columns; + for (int c = 0; c < num_cols; ++c) { + if (rng.Uniform(3) == 0) { // ~33% chance of being a blob column + blob_columns.emplace_back(c, MakeRandomBlobIndex(rng)); + } + } + + // V2 serialize → DeserializeV2 round-trip + std::string serialized; + std::vector deserialized; + std::vector> blob_out; + V2SerializeAndDeserialize(columns, blob_columns, &deserialized, &blob_out, + &serialized); + + // Verify version and HasBlobColumns + uint32_t v = 0; + ASSERT_OK(GetVersion(Slice(serialized), v)); + ASSERT_EQ(v, WideColumnSerialization::kVersion2); + + bool hb = false; + ASSERT_OK(WideColumnSerialization::HasBlobColumns(Slice(serialized), hb)); + ASSERT_EQ(hb, !blob_columns.empty()); + + // Verify blob column round-trip + ASSERT_EQ(blob_out.size(), blob_columns.size()); + for (size_t b = 0; b < blob_columns.size(); ++b) { + ASSERT_EQ(blob_out[b].first, blob_columns[b].first); + const BlobIndex& orig = blob_columns[b].second; + const BlobIndex& decoded = blob_out[b].second; + ASSERT_EQ(decoded.IsInlined(), orig.IsInlined()); + ASSERT_EQ(decoded.HasTTL(), orig.HasTTL()); + if (!decoded.IsInlined()) { + ASSERT_EQ(decoded.file_number(), orig.file_number()); + ASSERT_EQ(decoded.offset(), orig.offset()); + ASSERT_EQ(decoded.size(), orig.size()); + } + } + + // Verify inline column values + size_t blob_idx = 0; + for (int c = 0; c < num_cols; ++c) { + if (blob_idx < blob_columns.size() && + blob_columns[blob_idx].first == static_cast(c)) { + ++blob_idx; + } else { + ASSERT_EQ(deserialized[c].value(), columns[c].second); + } + } + + // If no blob columns, also verify Deserialize() and both overloads + if (blob_columns.empty()) { + VerifyDeserialize(serialized, columns); + + // WideColumns overload should produce identical output + std::string serialized2; + WideColumns wc = ToWideColumns(columns); + ASSERT_OK( + WideColumnSerialization::SerializeV2(wc, blob_columns, serialized2)); + ASSERT_EQ(serialized, serialized2); + } + + // V1 Serialize round-trip + { + WideColumns wc = ToWideColumns(columns); + std::string serialized_v1; + ASSERT_OK(WideColumnSerialization::Serialize(wc, serialized_v1)); + + ASSERT_OK(GetVersion(Slice(serialized_v1), v)); + ASSERT_EQ(v, WideColumnSerialization::kVersion1); + + VerifyDeserialize(serialized_v1, columns); + } + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/write_batch.cc b/db/write_batch.cc index 15034e5c3fcc..c2f7a7eddf51 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -551,9 +551,6 @@ Status WriteBatchInternal::Iterate(const WriteBatch* wb, if (LIKELY(!s.IsTryAgain())) { last_was_try_again = false; - tag = 0; - column_family = 0; // default - s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value, &blob, &xid, &write_unix_time); if (!s.ok()) { @@ -815,6 +812,12 @@ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize( s = Status::InvalidArgument("Default cf timestamp size mismatch"); } } + auto* cfd = + static_cast_with_check(column_family)->cfd(); + if (cfd && cfd->ioptions().disallow_memtable_writes) { + s = Status::InvalidArgument( + "This column family has disallow_memtable_writes=true"); + } } else if (b->default_cf_ts_sz_ > 0) { ts_sz = b->default_cf_ts_sz_; } @@ -836,6 +839,12 @@ Status CheckColumnFamilyTimestampSize(ColumnFamilyHandle* column_family, if (cf_ts_sz != ts.size()) { return Status::InvalidArgument("timestamp size mismatch"); } + auto* cfd = + static_cast_with_check(column_family)->cfd(); + if (cfd && cfd->ioptions().disallow_memtable_writes) { + return Status::InvalidArgument( + "This column family has disallow_memtable_writes=true"); + } return Status::OK(); } } // anonymous namespace @@ -1885,7 +1894,6 @@ Status WriteBatch::VerifyChecksum() const { // ReadRecordFromWriteBatch key.clear(); value.clear(); - column_family = 0; s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value, &blob, &xid, /*write_unix_time=*/nullptr); if (!s.ok()) { @@ -2185,6 +2193,13 @@ class MemTableInserter : public WriteBatch::Handler { } return false; } + auto* current = cf_mems_->current(); + if (current && current->ioptions().disallow_memtable_writes) { + *s = Status::InvalidArgument( + "This column family has disallow_memtable_writes=true"); + return false; + } + if (recovering_log_number_ != 0 && recovering_log_number_ < cf_mems_->GetLogNumber()) { // This is true only in recovery environment (recovering_log_number_ is @@ -3195,11 +3210,11 @@ Status WriteBatchInternal::InsertInto( ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db, - bool concurrent_memtable_writes, bool seq_per_batch, bool batch_per_txn) { + bool seq_per_batch, bool batch_per_txn) { MemTableInserter inserter( sequence, memtables, flush_scheduler, trim_history_scheduler, ignore_missing_column_families, recovery_log_number, db, - concurrent_memtable_writes, nullptr /* prot_info */, + /*concurrent_memtable_writes=*/false, nullptr /* prot_info */, nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn); for (auto w : write_group) { if (w->CallbackFailed()) { diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index 3cf3f4689a8c..f7b36a4133cf 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -185,18 +185,19 @@ class WriteBatchInternal { // If flush_scheduler is non-null, it will be invoked if the memtable // should be flushed. // - // Under concurrent use, the caller is responsible for making sure that - // the memtables object itself is thread-local. + // This overload is for non-concurrent insertion only. static Status InsertInto( WriteThread::WriteGroup& write_group, SequenceNumber sequence, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families = false, uint64_t log_number = 0, - DB* db = nullptr, bool concurrent_memtable_writes = false, - bool seq_per_batch = false, bool batch_per_txn = true); + DB* db = nullptr, bool seq_per_batch = false, bool batch_per_txn = true); // Convenience form of InsertInto when you have only one batch // next_seq returns the seq after last sequence number used in MemTable insert + // + // Under concurrent use, the caller is responsible for making sure that + // the memtables object itself is thread-local. static Status InsertInto( const WriteBatch* batch, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index 53094eca4b9b..4fd1d8bcdc65 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -419,7 +419,7 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) { WriteOptions write_options; ReadOptions read_options; string value; - DB* db; + std::unique_ptr db; DBImpl* db_impl; ASSERT_OK(DestroyDB(dbname, options)); @@ -428,7 +428,7 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) { Status s = DB::Open(options, dbname, &db); ASSERT_OK(s); - db_impl = dynamic_cast(db); + db_impl = dynamic_cast(db.get()); ASSERT_TRUE(db_impl); WriteBatch wb; @@ -481,7 +481,7 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) { ASSERT_TRUE(user_write_cb.write_enqueued_.load()); ASSERT_TRUE(user_write_cb.wal_write_done_.load()); - delete db; + db.reset(); ASSERT_OK(DestroyDB(dbname, options)); } diff --git a/db/write_thread.h b/db/write_thread.h index 42256970f413..6c2dc5dcd02a 100644 --- a/db/write_thread.h +++ b/db/write_thread.h @@ -132,7 +132,7 @@ class WriteThread { size_t protection_bytes_per_key; PreReleaseCallback* pre_release_callback; PostMemTableCallback* post_memtable_callback; - uint64_t log_used; // log number that this batch was inserted into + uint64_t wal_used; // log number that this batch was inserted into uint64_t log_ref; // log number that memtable insert should reference WriteCallback* callback; UserWriteCallback* user_write_cb; @@ -161,7 +161,7 @@ class WriteThread { protection_bytes_per_key(0), pre_release_callback(nullptr), post_memtable_callback(nullptr), - log_used(0), + wal_used(0), log_ref(0), callback(nullptr), user_write_cb(nullptr), @@ -179,7 +179,7 @@ class WriteThread { PostMemTableCallback* _post_memtable_callback = nullptr, bool _ingest_wbwi = false) : batch(_batch), - // TODO: store a copy of WriteOptions instead of its seperated data + // TODO: store a copy of WriteOptions instead of its separated data // members sync(write_options.sync), no_slowdown(write_options.no_slowdown), @@ -190,7 +190,7 @@ class WriteThread { protection_bytes_per_key(_batch->GetProtectionBytesPerKey()), pre_release_callback(_pre_release_callback), post_memtable_callback(_post_memtable_callback), - log_used(0), + wal_used(0), log_ref(_log_ref), callback(_callback), user_write_cb(_user_write_cb), diff --git a/db_stress_tool/CMakeLists.txt b/db_stress_tool/CMakeLists.txt index be34778ddd44..90200f342bf4 100644 --- a/db_stress_tool/CMakeLists.txt +++ b/db_stress_tool/CMakeLists.txt @@ -2,13 +2,14 @@ add_executable(db_stress${ARTIFACT_SUFFIX} batched_ops_stress.cc cf_consistency_stress.cc db_stress.cc + db_stress_compaction_service.cc + db_stress_compression_manager.cc db_stress_common.cc db_stress_driver.cc db_stress_filters.cc db_stress_gflags.cc db_stress_listener.cc db_stress_shared_state.cc - db_stress_stat.cc db_stress_test_base.cc db_stress_wide_merge_operator.cc db_stress_tool.cc diff --git a/db_stress_tool/cf_consistency_stress.cc b/db_stress_tool/cf_consistency_stress.cc index 1df4fc7cb7fc..d18c47281a69 100644 --- a/db_stress_tool/cf_consistency_stress.cc +++ b/db_stress_tool/cf_consistency_stress.cc @@ -1047,7 +1047,7 @@ class CfConsistencyStressTest : public StressTest { assert(thread); Status status; - DB* db_ptr = secondary_db_ ? secondary_db_ : db_; + DB* db_ptr = secondary_db_ ? secondary_db_.get() : db_; const auto& cfhs = secondary_db_ ? secondary_cfhs_ : column_families_; // Take a snapshot to preserve the state of primary db. diff --git a/db_stress_tool/db_stress_common.cc b/db_stress_tool/db_stress_common.cc index 968a6c16c0f8..c26401352234 100644 --- a/db_stress_tool/db_stress_common.cc +++ b/db_stress_tool/db_stress_common.cc @@ -13,6 +13,7 @@ #include +#include "file/file_util.h" #include "rocksdb/secondary_cache.h" #include "util/file_checksum_helper.h" #include "util/xxhash.h" @@ -228,6 +229,280 @@ void CompressedCacheSetCapacityThread(void* v) { } } +#ifndef NDEBUG +static void SetupFaultInjectionForRemoteCompaction(SharedState* shared) { + if (!fault_fs_guard) { + return; + } + + fault_fs_guard->SetThreadLocalErrorContext( + FaultInjectionIOType::kRead, shared->GetSeed(), FLAGS_read_fault_one_in, + FLAGS_inject_error_severity == 1 /* retryable */, + FLAGS_inject_error_severity == 2 /* has_data_loss*/); + fault_fs_guard->EnableThreadLocalErrorInjection(FaultInjectionIOType::kRead); + + fault_fs_guard->SetThreadLocalErrorContext( + FaultInjectionIOType::kWrite, shared->GetSeed(), FLAGS_write_fault_one_in, + FLAGS_inject_error_severity == 1 /* retryable */, + FLAGS_inject_error_severity == 2 /* has_data_loss*/); + fault_fs_guard->EnableThreadLocalErrorInjection(FaultInjectionIOType::kWrite); + + fault_fs_guard->SetThreadLocalErrorContext( + FaultInjectionIOType::kMetadataRead, shared->GetSeed(), + FLAGS_metadata_read_fault_one_in, + FLAGS_inject_error_severity == 1 /* retryable */, + FLAGS_inject_error_severity == 2 /* has_data_loss*/); + fault_fs_guard->EnableThreadLocalErrorInjection( + FaultInjectionIOType::kMetadataRead); + + fault_fs_guard->SetThreadLocalErrorContext( + FaultInjectionIOType::kMetadataWrite, shared->GetSeed(), + FLAGS_metadata_write_fault_one_in, + FLAGS_inject_error_severity == 1 /* retryable */, + FLAGS_inject_error_severity == 2 /* has_data_loss*/); + fault_fs_guard->EnableThreadLocalErrorInjection( + FaultInjectionIOType::kMetadataWrite); +} +#endif // NDEBUG + +static CompactionServiceOptionsOverride CreateOverrideOptions( + const Options& options, const CompactionServiceJobInfo& job_info) { + CompactionServiceOptionsOverride override_options{ + .env = db_stress_env, + .file_checksum_gen_factory = options.file_checksum_gen_factory, + .merge_operator = options.merge_operator, + .compaction_filter = options.compaction_filter, + .compaction_filter_factory = options.compaction_filter_factory, + .prefix_extractor = options.prefix_extractor, + .sst_partitioner_factory = options.sst_partitioner_factory, + .listeners = options.listeners, + .statistics = options.statistics, + .table_properties_collector_factories = + options.table_properties_collector_factories}; + + // TODO(jaykorean) - create a new compaction filter / merge operator and + // others for remote compactions + // + // Create a new Table Factory + ConfigOptions config_options; + config_options.ignore_unknown_options = false; + config_options.ignore_unsupported_options = false; + + Status s = TableFactory::CreateFromString(config_options, + options.table_factory->Name(), + &override_options.table_factory); + + if (s.ok()) { + std::string options_str; + s = options.table_factory->GetOptionString(config_options, &options_str); + if (s.ok()) { + s = override_options.table_factory->ConfigureFromString(config_options, + options_str); + } + } + + if (!s.ok()) { + fprintf(stdout, + "Failed to set up TableFactory for remote compaction - (%s): %s\n", + job_info.db_name.c_str(), s.ToString().c_str()); + } + + return override_options; +} + +static Status CleanupOutputDirectory(const std::string& output_directory) { +#ifndef NDEBUG + // Temporarily disable fault injection to ensure deletion always succeeds + if (fault_fs_guard) { + fault_fs_guard->DisableAllThreadLocalErrorInjection(); + } +#endif // NDEBUG + + Status s = DestroyDir(db_stress_env, output_directory); + if (!s.ok()) { + fprintf(stderr, + "Failed to destroy output directory %s when allow_resumption is " + "false: %s\n", + output_directory.c_str(), s.ToString().c_str()); + } + + if (s.ok()) { + s = db_stress_env->CreateDir(output_directory); + if (!s.ok()) { + fprintf(stderr, + "Failed to recreate output directory %s when allow_resumption is " + "false: %s\n", + output_directory.c_str(), s.ToString().c_str()); + } + } + +#ifndef NDEBUG + // Re-enable fault injection after deletion + if (fault_fs_guard) { + fault_fs_guard->EnableAllThreadLocalErrorInjection(); + } +#endif // NDEBUG + + return s; +} + +// Set up cancellation mechanism for testing resumable remote compactions. +// Spawns a detached thread to trigger cancellation after a delay (50ms +// initially, or 2/3 of the previous successful compaction time for adaptive +// timing). First-time jobs are always canceled; retries have a 10% chance +// to test consecutive cancellation scenarios. +static std::shared_ptr> SetupCancellation( + OpenAndCompactOptions& open_compact_options, bool was_canceled, + Random& rand, uint64_t successful_compaction_end_to_end_micros) { + auto canceled = std::make_shared>(false); + open_compact_options.canceled = canceled.get(); + + bool should_cancel = !was_canceled || rand.OneIn(10); + + if (should_cancel) { + std::thread interruption_thread( + [canceled, successful_compaction_end_to_end_micros]() { + uint64_t sleep_micros = + successful_compaction_end_to_end_micros == 0 + ? 50000 + : successful_compaction_end_to_end_micros * 2 / 3; + std::this_thread::sleep_for(std::chrono::microseconds(sleep_micros)); + canceled->store(true); + }); + interruption_thread.detach(); + } + + return canceled; +} + +// Process the result of OpenAndCompact operation +static void ProcessCompactionResult( + const Status& s, const std::string& job_id, + const CompactionServiceJobInfo& job_info, + const std::string& serialized_input, const std::string& output_directory, + const std::string& serialized_output, SharedState* shared, + uint64_t& successful_compaction_end_to_end_micros, uint64_t start_micros, + Env* env) { + if (s.IsManualCompactionPaused() && FLAGS_allow_resumption_one_in > 0) { + // Re-enqueue for retry + shared->EnqueueRemoteCompaction(job_id, job_info, serialized_input, + output_directory, true /* was_cancelled */); + return; + } + + if (!s.ok()) { + if (!StressTest::IsErrorInjectedAndRetryable(s)) { + // Print in stdout instead of stderr to avoid stress test failure, + // because OpenAndCompact() failure doesn't necessarily mean + // primary db instance failure. + fprintf(stdout, "Failed to run OpenAndCompact(%s): %s\n", + job_info.db_name.c_str(), s.ToString().c_str()); + } + } else { + // Track successful completion time + successful_compaction_end_to_end_micros = env->NowMicros() - start_micros; + } + + // Add the output regardless of status, so that primary DB doesn't rely + // on the timeout to finish waiting. The actual failure from the + // deserialization can fail the compaction properly + shared->AddRemoteCompactionResult(job_id, s, serialized_output); +} + +static void ProcessRemoteCompactionJob( + const std::string& job_id, const CompactionServiceJobInfo& job_info, + const std::string& serialized_input, const std::string& output_directory, + bool was_canceled, SharedState* shared, StressTest* stress_test, + Random& rand, uint64_t& successful_compaction_end_to_end_micros) { + auto options = stress_test->GetOptions(job_info.cf_id); + assert(options.env != nullptr); + + auto override_options = CreateOverrideOptions(options, job_info); + + OpenAndCompactOptions open_compact_options; + if (FLAGS_allow_resumption_one_in > 0) { + open_compact_options.allow_resumption = + rand.OneIn(FLAGS_allow_resumption_one_in); + } else { + open_compact_options.allow_resumption = false; + } + + if (!open_compact_options.allow_resumption) { + CleanupOutputDirectory(output_directory); + } + + std::shared_ptr> canceled = nullptr; + if (FLAGS_allow_resumption_one_in > 0) { + canceled = SetupCancellation(open_compact_options, was_canceled, rand, + successful_compaction_end_to_end_micros); + } + + std::string serialized_output; + uint64_t start_micros = options.env->NowMicros(); + + Status s = DB::OpenAndCompact(open_compact_options, job_info.db_name, + output_directory, serialized_input, + &serialized_output, override_options); + + ProcessCompactionResult(s, job_id, job_info, serialized_input, + output_directory, serialized_output, shared, + successful_compaction_end_to_end_micros, start_micros, + options.env); +} + +void RemoteCompactionWorkerThread(void* v) { + assert(FLAGS_remote_compaction_worker_threads > 0); + assert(FLAGS_remote_compaction_worker_interval > 0); + + auto* thread = static_cast(v); + SharedState* shared = thread->shared; + StressTest* stress_test = shared->GetStressTest(); + assert(stress_test != nullptr); + +#ifndef NDEBUG + SetupFaultInjectionForRemoteCompaction(shared); +#endif // NDEBUG + + // Tracks the duration (in microseconds) of the most recent successfully + // completed compaction from start to finish. This value is used in + // SetupCancellation() to adaptively set up cancellation point for a + // compaction + uint64_t successful_compaction_end_to_end_micros = 0; + Random rand(static_cast(FLAGS_seed)); + + // Main worker loop + while (true) { + // Check if we should stop + { + MutexLock l(shared->GetMutex()); + if (shared->ShouldStopBgThread()) { + shared->IncBgThreadsFinished(); + if (shared->BgThreadsFinished()) { + shared->GetCondVar()->SignalAll(); + } + return; + } + } + + std::string job_id; + CompactionServiceJobInfo job_info; + std::string serialized_input; + std::string output_directory; + bool was_canceled; + + if (shared->DequeueRemoteCompaction(&job_id, &job_info, &serialized_input, + &output_directory, &was_canceled)) { + ProcessRemoteCompactionJob( + job_id, job_info, serialized_input, output_directory, was_canceled, + shared, stress_test, rand, successful_compaction_end_to_end_micros); + } + + db_stress_env->SleepForMicroseconds( + thread->rand.Next() % FLAGS_remote_compaction_worker_interval * 1000 + + 1); + } +} + void PrintKeyValue(int cf, uint64_t key, const char* value, size_t sz) { if (!FLAGS_verbose) { return; @@ -602,5 +877,24 @@ Status DestroyUnverifiedSubdir(const std::string& dirname) { return s; } +Status DbStressDestroyDb(const std::string& db_path) { + Status s; + Options options; + // NOTE: using db_stress_listener_env in order to see obsolete MANIFEST files + options.env = db_stress_listener_env; + // Remove DB files in a principled way to avoid issues + if (FLAGS_use_blob_db) { + s = blob_db::DestroyBlobDB(db_path, options, blob_db::BlobDBOptions()); + } else { + s = DestroyDB(db_path, options); + } + if (!s.ok()) { + return s; + } + // Remove everything else recursively, only reporting success if able to + // delete everything + return DestroyDir(db_stress_listener_env, db_path); +} + } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index 0871a87f9e70..fff3720f150d 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -100,13 +100,14 @@ DECLARE_bool(enable_pipelined_write); DECLARE_bool(verify_before_write); DECLARE_bool(histogram); DECLARE_bool(destroy_db_initially); +DECLARE_bool(destroy_db_and_exit); +DECLARE_string(delete_dir_and_exit); DECLARE_bool(verbose); DECLARE_bool(progress_reports); DECLARE_uint64(db_write_buffer_size); DECLARE_int32(write_buffer_size); DECLARE_int32(max_write_buffer_number); DECLARE_int32(min_write_buffer_number_to_merge); -DECLARE_int32(max_write_buffer_number_to_maintain); DECLARE_int64(max_write_buffer_size_to_maintain); DECLARE_bool(use_write_buffer_manager); DECLARE_double(memtable_prefix_bloom_size_ratio); @@ -160,6 +161,8 @@ DECLARE_uint64(periodic_compaction_seconds); DECLARE_string(daily_offpeak_time_utc); DECLARE_uint64(compaction_ttl); DECLARE_bool(fifo_allow_compaction); +DECLARE_uint64(fifo_compaction_max_data_files_size_mb); +DECLARE_bool(fifo_compaction_use_kv_ratio_compaction); DECLARE_bool(allow_concurrent_memtable_write); DECLARE_double(experimental_mempurge_threshold); DECLARE_bool(enable_write_thread_adaptive_yield); @@ -174,6 +177,7 @@ DECLARE_uint32(sqfc_version); DECLARE_bool(use_sqfc_for_range_queries); DECLARE_int32(index_type); DECLARE_int32(data_block_index_type); +DECLARE_int32(index_block_search_type); DECLARE_string(db); DECLARE_string(secondaries_base); DECLARE_bool(test_secondary); @@ -218,6 +222,7 @@ DECLARE_int32(reset_stats_one_in); DECLARE_int32(pause_background_one_in); DECLARE_int32(disable_file_deletions_one_in); DECLARE_int32(disable_manual_compaction_one_in); +DECLARE_int32(abort_and_resume_compactions_one_in); DECLARE_int32(compact_range_width); DECLARE_int32(acquire_snapshot_one_in); DECLARE_bool(compare_full_db_state_snapshot); @@ -249,6 +254,7 @@ DECLARE_string(fs_uri); DECLARE_uint64(ops_per_thread); DECLARE_uint64(log2_keys_per_lock); DECLARE_uint64(max_manifest_file_size); +DECLARE_int32(max_manifest_space_amp_pct); DECLARE_bool(in_place_update); DECLARE_string(memtablerep); DECLARE_int32(prefix_size); @@ -276,6 +282,7 @@ DECLARE_string(last_level_temperature); DECLARE_string(default_write_temperature); DECLARE_string(default_temperature); DECLARE_bool(paranoid_memory_checks); +DECLARE_bool(memtable_veirfy_per_key_checksum_on_seek); // Options for transaction dbs. // Use TransactionDB (a.k.a. Pessimistic Transaction DB) @@ -285,6 +292,7 @@ DECLARE_bool(use_txn); // Options for TransactionDB (a.k.a. Pessimistic Transaction DB) DECLARE_uint64(txn_write_policy); DECLARE_bool(unordered_write); +DECLARE_bool(use_per_key_point_lock_mgr); // Options for OptimisticTransactionDB DECLARE_bool(use_optimistic_txn); @@ -294,11 +302,8 @@ DECLARE_uint32(occ_lock_bucket_count); // Options for StackableDB-based BlobDB DECLARE_bool(use_blob_db); -DECLARE_uint64(blob_db_min_blob_size); -DECLARE_uint64(blob_db_bytes_per_sync); DECLARE_uint64(blob_db_file_size); DECLARE_bool(blob_db_enable_gc); -DECLARE_double(blob_db_gc_cutoff); // Options for integrated BlobDB DECLARE_bool(allow_setting_blob_options_dynamically); @@ -321,7 +326,6 @@ DECLARE_int32(approximate_size_one_in); DECLARE_bool(best_efforts_recovery); DECLARE_bool(skip_verifydb); DECLARE_bool(paranoid_file_checks); -DECLARE_bool(fail_if_options_file_error); DECLARE_uint64(batch_protection_bytes_per_key); DECLARE_uint32(memtable_protection_bytes_per_key); DECLARE_uint32(block_protection_bytes_per_key); @@ -397,9 +401,9 @@ DECLARE_bool(enable_index_compression); DECLARE_uint32(index_shortening); DECLARE_uint32(metadata_charge_policy); DECLARE_bool(use_adaptive_mutex_lru); -DECLARE_uint32(compress_format_version); DECLARE_uint64(manifest_preallocation_size); DECLARE_bool(enable_checksum_handoff); +DECLARE_string(compression_manager); DECLARE_uint64(max_total_wal_size); DECLARE_double(high_pri_pool_ratio); DECLARE_double(low_pri_pool_ratio); @@ -409,6 +413,8 @@ DECLARE_uint64(max_sequential_skip_in_iterations); DECLARE_bool(enable_sst_partitioner_factory); DECLARE_bool(enable_do_not_compress_roles); DECLARE_bool(block_align); +DECLARE_uint64(super_block_alignment_size); +DECLARE_uint64(super_block_alignment_space_overhead_ratio); DECLARE_uint32(lowest_used_cache_tier); DECLARE_bool(enable_custom_split_merge); DECLARE_uint32(adm_policy); @@ -420,10 +426,27 @@ DECLARE_uint32(uncache_aggressiveness); DECLARE_int32(test_ingest_standalone_range_deletion_one_in); DECLARE_bool(allow_unprepared_value); DECLARE_string(file_temperature_age_thresholds); +DECLARE_bool(allow_trivial_copy_when_change_temperature); DECLARE_uint32(commit_bypass_memtable_one_in); DECLARE_bool(track_and_verify_wals); -DECLARE_bool(enable_remote_compaction); +DECLARE_int32(remote_compaction_worker_threads); +DECLARE_int32(remote_compaction_worker_interval); +DECLARE_bool(remote_compaction_failure_fall_back_to_local); +DECLARE_int32(allow_resumption_one_in); DECLARE_bool(auto_refresh_iterator_with_snapshot); +DECLARE_uint32(memtable_op_scan_flush_trigger); +DECLARE_uint32(memtable_avg_op_scan_flush_trigger); +DECLARE_uint32(ingest_wbwi_one_in); +DECLARE_bool(universal_reduce_file_locking); +DECLARE_bool(use_multiscan); +DECLARE_bool(multiscan_use_async_io); + +// Compaction deletion trigger declarations for stress testing +DECLARE_bool(enable_compaction_on_deletion_trigger); +DECLARE_uint64(compaction_on_deletion_min_file_size); +DECLARE_int32(compaction_on_deletion_trigger_count); +DECLARE_int32(compaction_on_deletion_window_size); +DECLARE_double(compaction_on_deletion_ratio); constexpr long KB = 1024; constexpr int kRandomValueMaxFactor = 3; @@ -751,6 +774,8 @@ void PoolSizeChangeThread(void* v); void DbVerificationThread(void* v); +void RemoteCompactionWorkerThread(void* v); + void CompressedCacheSetCapacityThread(void* v); void TimestampedSnapshotsThread(void* v); @@ -797,5 +822,10 @@ Status SaveFilesInDirectory(const std::string& src_dirname, const std::string& dst_dirname); Status DestroyUnverifiedSubdir(const std::string& dirname); Status InitUnverifiedSubdir(const std::string& dirname); + +// Destroy the DB at the given path under the env configured for db_stress. +// Handles both regular DB and BlobDB, and cleans and removes the entire dir. +Status DbStressDestroyDb(const std::string& db_path); + } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff --git a/db_stress_tool/db_stress_compaction_service.cc b/db_stress_tool/db_stress_compaction_service.cc new file mode 100644 index 000000000000..b64fe56095e6 --- /dev/null +++ b/db_stress_tool/db_stress_compaction_service.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef GFLAGS + +#include "db_stress_tool/db_stress_compaction_service.h" + +#include + +#include "db_stress_tool/db_stress_test_base.h" +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +CompactionServiceJobStatus DbStressCompactionService::Wait( + const std::string& scheduled_job_id, std::string* result) { + while (true) { + if (aborted_.load()) { + return CompactionServiceJobStatus::kAborted; + } + const auto& maybeResultStatus = + shared_->GetRemoteCompactionResult(scheduled_job_id, result); + if (maybeResultStatus.has_value()) { + auto s = maybeResultStatus.value(); + if (s.ok()) { + assert(result); + assert(!result->empty()); + return CompactionServiceJobStatus::kSuccess; + } else { + // Remote Compaction failed + if (failure_should_fall_back_to_local_) { + return CompactionServiceJobStatus::kUseLocal; + } + if (StressTest::IsErrorInjectedAndRetryable(s)) { + return CompactionServiceJobStatus::kUseLocal; + } + if (result && result->empty()) { + // If result is empty, set the compaction status in the result so + // that it can be bubbled up to main thread + CompactionServiceResult compaction_result; + compaction_result.status = s; + if (compaction_result.Write(result).ok()) { + assert(result); + assert(!result->empty()); + } + } + return CompactionServiceJobStatus::kFailure; + } + } else { + // Remote Compaction is still running + Env::Default()->SleepForMicroseconds(kWaitIntervalInMicros); + } + } + return CompactionServiceJobStatus::kFailure; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff --git a/db_stress_tool/db_stress_compaction_service.h b/db_stress_tool/db_stress_compaction_service.h index f1fc04ea4467..a3566cef52a2 100644 --- a/db_stress_tool/db_stress_compaction_service.h +++ b/db_stress_tool/db_stress_compaction_service.h @@ -3,37 +3,91 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#ifdef GFLAGS #pragma once +#include "db/compaction/compaction_job.h" +#include "db_stress_shared_state.h" #include "rocksdb/options.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { // Service to simulate Remote Compaction in Stress Test class DbStressCompactionService : public CompactionService { public: - explicit DbStressCompactionService() {} + explicit DbStressCompactionService(SharedState* shared, + bool failure_should_fall_back_to_local) + : shared_(shared), + aborted_(false), + failure_should_fall_back_to_local_(failure_should_fall_back_to_local) {} static const char* kClassName() { return "DbStressCompactionService"; } const char* Name() const override { return kClassName(); } + static constexpr uint64_t kWaitIntervalInMicros = 10 * 1000; // 10ms + + static constexpr const char* kTempOutputDirectoryPrefix = "tmp_output_"; + CompactionServiceScheduleResponse Schedule( - const CompactionServiceJobInfo& /*info*/, - const std::string& /*compaction_service_input*/) override { + const CompactionServiceJobInfo& info, + const std::string& compaction_service_input) override { + std::string job_id = info.db_id + "_" + info.db_session_id + "_" + + std::to_string(info.job_id); + + if (aborted_.load()) { + return CompactionServiceScheduleResponse( + job_id, CompactionServiceJobStatus::kUseLocal); + } + std::string output_directory = info.db_name + "/" + + kTempOutputDirectoryPrefix + + Env::Default()->GenerateUniqueId(); + + shared_->EnqueueRemoteCompaction( + job_id, info, compaction_service_input, output_directory, + false /* was_cancelled */); // Not canceled initially CompactionServiceScheduleResponse response( - "Implement Me", CompactionServiceJobStatus::kUseLocal); + job_id, CompactionServiceJobStatus::kSuccess); return response; } - CompactionServiceJobStatus Wait(const std::string& /*scheduled_job_id*/, - std::string* /*result*/) override { - // TODO - Implement - return CompactionServiceJobStatus::kUseLocal; + CompactionServiceJobStatus Wait(const std::string& scheduled_job_id, + std::string* result) override; + + void OnInstallation(const std::string& scheduled_job_id, + CompactionServiceJobStatus /*status*/) override { + // Clean up tmp directory + std::string serialized; + CompactionServiceResult result; + if (shared_->GetRemoteCompactionResult(scheduled_job_id, &serialized) + .has_value()) { + if (CompactionServiceResult::Read(serialized, &result).ok()) { + std::vector filenames; + Status s = Env::Default()->GetChildren(result.output_path, &filenames); + for (size_t i = 0; s.ok() && i < filenames.size(); ++i) { + s = Env::Default()->DeleteFile(result.output_path + "/" + + filenames[i]); + if (!s.ok()) { + // TODO - Handle clean up failure? + break; + } + } + if (s.ok()) { + Env::Default()->DeleteDir(result.output_path).PermitUncheckedError(); + } + } + shared_->RemoveRemoteCompactionResult(scheduled_job_id); + } } - // TODO - Implement - void CancelAwaitingJobs() override {} -}; + void CancelAwaitingJobs() override { aborted_.store(true); } + private: + SharedState* shared_; + std::atomic_bool aborted_{false}; + bool failure_should_fall_back_to_local_; +}; } // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff --git a/db_stress_tool/db_stress_compression_manager.cc b/db_stress_tool/db_stress_compression_manager.cc new file mode 100644 index 000000000000..9746c490333f --- /dev/null +++ b/db_stress_tool/db_stress_compression_manager.cc @@ -0,0 +1,28 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#include "db_stress_compression_manager.h" + +#include "rocksdb/utilities/object_registry.h" + +namespace ROCKSDB_NAMESPACE { +void DbStressCustomCompressionManager::Register() { + // We must register any compression managers with a custom + // CompatibilityName() so that if it was used in a past invocation but not + // the current invocation, we can still read the SST files requiring it. + static std::once_flag loaded; + std::call_once(loaded, [&]() { + TEST_AllowUnsupportedFormatVersion() = true; + auto& library = *ObjectLibrary::Default(); + library.AddFactory( + DbStressCustomCompressionManager().CompatibilityName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /*errmsg*/) { + *guard = std::make_unique(); + return guard->get(); + }); + }); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/db_stress_tool/db_stress_compression_manager.h b/db_stress_tool/db_stress_compression_manager.h new file mode 100644 index 000000000000..8438a6583c7d --- /dev/null +++ b/db_stress_tool/db_stress_compression_manager.h @@ -0,0 +1,67 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +class DbStressCustomCompressionManager : public CompressionManager { + public: + const char* Name() const override { + return "DbStressCustomCompressionManager"; + } + const char* CompatibilityName() const override { return "DbStressCustom1"; } + + bool SupportsCompressionType(CompressionType type) const override { + return default_->SupportsCompressionType(type) || + type == kCustomCompressionAA || type == kCustomCompressionAB || + type == kCustomCompressionAC; + } + + std::unique_ptr GetCompressor(const CompressionOptions& opts, + CompressionType type) override { + // db_stress never specifies a custom type, so we randomly use them anyway + // when this compression manager is used. + std::array choices = { + type, kCustomCompressionAA, kCustomCompressionAB, kCustomCompressionAC}; + type = choices[Random::GetTLSInstance()->Uniform(4)]; + switch (static_cast(type)) { + case kCustomCompressionAA: + return std::make_unique< + test::CompressorCustomAlg>(); + case kCustomCompressionAB: + return std::make_unique< + test::CompressorCustomAlg>(); + case kCustomCompressionAC: + return std::make_unique< + test::CompressorCustomAlg>(); + // Also support built-in compression algorithms + default: + return GetBuiltinV2CompressionManager()->GetCompressor(opts, type); + } + } + + std::shared_ptr GetDecompressor() override { + return std::make_shared(); + } + + std::shared_ptr GetDecompressorForTypes( + const CompressionType* types_begin, + const CompressionType* types_end) override { + auto decomp = std::make_shared(); + decomp->SetAllowedTypes(types_begin, types_end); + return decomp; + } + + static void Register(); + + protected: + std::shared_ptr default_ = + GetBuiltinV2CompressionManager(); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/db_stress_tool/db_stress_driver.cc b/db_stress_tool/db_stress_driver.cc index d5fb3e643652..aa93de97ec4a 100644 --- a/db_stress_tool/db_stress_driver.cc +++ b/db_stress_tool/db_stress_driver.cc @@ -102,6 +102,14 @@ bool RunStressTestImpl(SharedState* shared) { shared->IncBgThreads(); } + uint32_t remote_compaction_worker_thread_count = + FLAGS_remote_compaction_worker_threads; + if (remote_compaction_worker_thread_count > 0) { + for (uint32_t i = 0; i < remote_compaction_worker_thread_count; i++) { + shared->IncBgThreads(); + } + } + std::vector threads(n); for (uint32_t i = 0; i < n; i++) { threads[i] = new ThreadState(i, shared); @@ -126,6 +134,17 @@ bool RunStressTestImpl(SharedState* shared) { &compressed_cache_set_capacity_thread); } + std::vector remote_compaction_worker_threads; + if (remote_compaction_worker_thread_count > 0) { + remote_compaction_worker_threads.reserve( + remote_compaction_worker_thread_count); + for (uint32_t i = 0; i < remote_compaction_worker_thread_count; i++) { + ThreadState* ts = new ThreadState(i, shared); + remote_compaction_worker_threads.push_back(ts); + db_stress_env->StartThread(RemoteCompactionWorkerThread, ts); + } + } + // Each thread goes through the following states: // initializing -> wait for others to init -> read/populate/depopulate // wait for others to operate -> verify -> done @@ -218,6 +237,7 @@ bool RunStressTestImpl(SharedState* shared) { delete threads[i]; threads[i] = nullptr; } + now = clock->NowMicros(); if (!FLAGS_skip_verifydb && !FLAGS_test_batches_snapshots && !shared->HasVerificationFailedYet()) { @@ -232,7 +252,8 @@ bool RunStressTestImpl(SharedState* shared) { if (FLAGS_compaction_thread_pool_adjust_interval > 0 || FLAGS_continuous_verification_interval > 0 || FLAGS_compressed_secondary_cache_size > 0 || - FLAGS_compressed_secondary_cache_ratio > 0.0) { + FLAGS_compressed_secondary_cache_ratio > 0.0 || + remote_compaction_worker_thread_count > 0) { MutexLock l(shared->GetMutex()); shared->SetShouldStopBgThread(); while (!shared->BgThreadsFinished()) { @@ -240,6 +261,15 @@ bool RunStressTestImpl(SharedState* shared) { } } + assert(remote_compaction_worker_threads.size() == + remote_compaction_worker_thread_count); + if (remote_compaction_worker_thread_count > 0) { + for (ThreadState* thread_state : remote_compaction_worker_threads) { + delete thread_state; + } + remote_compaction_worker_threads.clear(); + } + if (shared->HasVerificationFailedYet()) { fprintf(stderr, "Verification failed :(\n"); return false; diff --git a/db_stress_tool/db_stress_env_wrapper.h b/db_stress_tool/db_stress_env_wrapper.h index 5ea9e8b6ef1c..4186bc41f653 100644 --- a/db_stress_tool/db_stress_env_wrapper.h +++ b/db_stress_tool/db_stress_env_wrapper.h @@ -9,8 +9,11 @@ #ifdef GFLAGS #pragma once + #include "db_stress_tool/db_stress_common.h" +#include "file/filename.h" #include "monitoring/thread_status_util.h" +#include "rocksdb/file_checksum.h" namespace ROCKSDB_NAMESPACE { namespace { @@ -173,6 +176,35 @@ class DbStressFSWrapper : public FileSystemWrapper { const FileOptions& file_opts, std::unique_ptr* r, IODebugContext* dbg) override { + // verify that file checksums are propagated through FileOptions + // for SST file opens. + + std::string basename = f.substr(f.rfind('/') + 1); + uint64_t file_number; + FileType file_type; + if (ParseFileName(basename, &file_number, &file_type) && + file_type == kTableFile) { + // file_checksum_func_name must always be populated to be sure each call + // site within RocksDB is intentional about populating the fields with the + // best available information: + // - kNoFileChecksumFuncName: no checksum context available + // (e.g., SstFileDumper, SstFileReader, checksum generation), + // always paired with empty checksum + // - kUnknownFileChecksumFuncName: file created without a + // checksum factory (from MANIFEST), always paired with + // empty checksum + // - a real name (e.g., "FileChecksumCrc32c"): checksum exists + assert(!file_opts.file_checksum_func_name.empty()); + if (file_opts.file_checksum_func_name == kUnknownFileChecksumFuncName || + file_opts.file_checksum_func_name == kNoFileChecksumFuncName) { + // No checksum available — checksum value must be empty + assert(file_opts.file_checksum.empty()); + } else { + // A real checksum function — checksum value must be present + assert(!file_opts.file_checksum.empty()); + } + } + std::unique_ptr file; IOStatus s = target()->NewRandomAccessFile(f, file_opts, &file, dbg); if (s.ok()) { diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 94028f07b40c..19b4c602e7c3 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -135,6 +135,14 @@ DEFINE_bool(histogram, false, "Print histogram of operation timings"); DEFINE_bool(destroy_db_initially, true, "Destroys the database dir before start if this is true"); +DEFINE_bool(destroy_db_and_exit, false, + "Destroys the database dir and exits. Useful for cleanup without " + "running stress test. Other options are mostly ignored."); + +DEFINE_string(delete_dir_and_exit, "", + "Recursively deletes the specified directory and exits. " + "Useful for cleaning up TEST_TMPDIR after crash tests."); + DEFINE_bool(verbose, false, "Verbose"); DEFINE_bool(progress_reports, true, @@ -168,20 +176,6 @@ DEFINE_int32(min_write_buffer_number_to_merge, "writing less data to storage if there are duplicate records in" " each of these individual write buffers."); -DEFINE_int32(max_write_buffer_number_to_maintain, - ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain, - "The total maximum number of write buffers to maintain in memory " - "including copies of buffers that have already been flushed. " - "Unlike max_write_buffer_number, this parameter does not affect " - "flushing. This controls the minimum amount of write history " - "that will be available in memory for conflict checking when " - "Transactions are used. If this value is too low, some " - "transactions may fail at commit time due to not being able to " - "determine whether there were any write conflicts. Setting this " - "value to 0 will cause write buffers to be freed immediately " - "after they are flushed. If this value is set to -1, " - "'max_write_buffer_number' will be used."); - DEFINE_int64(max_write_buffer_size_to_maintain, ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain, "The total maximum size of write buffers to maintain in memory " @@ -421,6 +415,17 @@ DEFINE_bool(fifo_allow_compaction, false, "If true, set `Options::compaction_options_fifo.allow_compaction = " "true`. It only take effect when FIFO compaction is used."); +DEFINE_uint64(fifo_compaction_max_data_files_size_mb, 0, + "If non-zero, set " + "`Options::compaction_options_fifo.max_data_files_size` to this " + "value (in MB). Only takes effect with FIFO compaction."); + +DEFINE_bool(fifo_compaction_use_kv_ratio_compaction, false, + "If true, set " + "`Options::compaction_options_fifo.use_kv_ratio_compaction = " + "true`. Requires fifo_allow_compaction and " + "fifo_compaction_max_data_files_size_mb > 0."); + DEFINE_bool(allow_concurrent_memtable_write, false, "Allow multi-writers to update mem tables in parallel."); @@ -435,17 +440,6 @@ DEFINE_bool(enable_write_thread_adaptive_yield, // Options for StackableDB-based BlobDB DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Use BlobDB."); -DEFINE_uint64( - blob_db_min_blob_size, - ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size, - "[Stacked BlobDB] Smallest blob to store in a file. Blobs " - "smaller than this will be inlined with the key in the LSM tree."); - -DEFINE_uint64( - blob_db_bytes_per_sync, - ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync, - "[Stacked BlobDB] Sync blob files once per every N bytes written."); - DEFINE_uint64(blob_db_file_size, ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size, "[Stacked BlobDB] Target size of each blob file."); @@ -455,11 +449,6 @@ DEFINE_bool( ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection, "[Stacked BlobDB] Enable BlobDB garbage collection."); -DEFINE_double( - blob_db_gc_cutoff, - ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff, - "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection."); - // Options for integrated BlobDB DEFINE_bool(allow_setting_blob_options_dynamically, false, "[Integrated BlobDB] Allow setting blob options dynamically."); @@ -481,7 +470,9 @@ DEFINE_uint64(blob_file_size, DEFINE_string(blob_compression_type, "none", "[Integrated BlobDB] The compression algorithm to use for large " "values stored in blob files."); - +DEFINE_string(compression_manager, "mixed", + "Ability to change compression manager specified in " + "simple_mixed_manager.h (mixed -> roundRobin)"); DEFINE_bool(enable_blob_garbage_collection, ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() .enable_blob_garbage_collection, @@ -550,6 +541,9 @@ DEFINE_string(file_temperature_age_thresholds, "", "See CompactionOptionsFIFO::file_temperature_age_thresholds. " "empty == unset"); +DEFINE_bool(allow_trivial_copy_when_change_temperature, true, + "Allow kChangeTemperature to do trivial copy"); + static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range); @@ -613,6 +607,12 @@ DEFINE_int32( ROCKSDB_NAMESPACE::BlockBasedTableOptions().data_block_index_type), "Index type for data blocks (see `enum DataBlockIndexType` in table.h)"); +DEFINE_int32(index_block_search_type, + static_cast(ROCKSDB_NAMESPACE::BlockBasedTableOptions() + .index_block_search_type), + "Search algorithm for index blocks (see `enum BlockSearchType` in " + "table.h)"); + DEFINE_string(db, "", "Use the db with the following name."); DEFINE_string(secondaries_base, "", @@ -733,6 +733,10 @@ DEFINE_uint64(txn_write_policy, 0, "TxnDBWritePolicy::WRITE_COMMITTED. Note that this should not be " "changed across crashes."); +DEFINE_bool(use_per_key_point_lock_mgr, true, + "Use PointLockManager(false) or PerKeyPointLockManager(true) in " + "TransactionDB."); + DEFINE_bool(use_optimistic_txn, false, "Use OptimisticTransactionDB."); DEFINE_uint64(occ_validation_policy, 1, "Optimistic Concurrency Control Validation Policy for " @@ -813,6 +817,10 @@ DEFINE_int32( "If non-zero, then DisableManualCompaction()+Enable will be called " "once for every N ops on average. 0 disables."); +DEFINE_int32(abort_and_resume_compactions_one_in, 0, + "If non-zero, then AbortAllCompactions()+Resume will be called " + "once for every N ops on average. 0 disables."); + DEFINE_int32(compact_range_width, 10000, "The width of the ranges passed to CompactRange()."); @@ -853,8 +861,28 @@ DEFINE_bool(track_and_verify_wals, ROCKSDB_NAMESPACE::Options().track_and_verify_wals, "See Options::track_and_verify_wals"); -DEFINE_bool(enable_remote_compaction, false, - "Enable (simulated) Remote Compaction"); +DEFINE_int32( + remote_compaction_worker_threads, 2, + "Remote Compaction Worker Thread count. If 0, remote compaction is " + "disabled"); + +DEFINE_int32(remote_compaction_worker_interval, 10, + "Remote Compaction Worker Thread dequeue tasks every N " + "milliseconds. (Default: 10ms)"); + +DEFINE_bool(remote_compaction_failure_fall_back_to_local, true, + "If true, remote compaction failures will be ignored and " + "compactions will fall back to local and retried"); + +DEFINE_int32(allow_resumption_one_in, 0, + "If non-zero, enable resumable compaction with 1/N probability " + "for each OpenAndCompact call.Requires " + "remote_compaction_worker_threads > 0"); + +DEFINE_uint32(ingest_wbwi_one_in, 0, + "If set, will call" + "IngestWriteBatchWithIndex() instead of regular write operations " + "once every N writes."); static bool ValidateInt32Percent(const char* flagname, int32_t value) { if (value < 0 || value > 100) { @@ -963,7 +991,11 @@ DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock"); static const bool FLAGS_log2_keys_per_lock_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_log2_keys_per_lock, &ValidateUint32Range); -DEFINE_uint64(max_manifest_file_size, 16384, "Maximum size of a MANIFEST file"); +DEFINE_uint64(max_manifest_file_size, 16384, + "Maximum size of a MANIFEST file (without auto-tuning)"); + +DEFINE_int32(max_manifest_space_amp_pct, 500, + "Max manifest space amp percentage for auto-tuning"); DEFINE_bool(in_place_update, false, "On true, does inplace update in memtable"); @@ -1098,10 +1130,6 @@ DEFINE_bool(paranoid_file_checks, true, "After writing every SST file, reopen it and read all the keys " "and validate checksums"); -DEFINE_bool(fail_if_options_file_error, false, - "Fail operations that fail to detect or properly persist options " - "file."); - DEFINE_uint64(batch_protection_bytes_per_key, 0, "If nonzero, enables integrity protection in `WriteBatch` at the " "specified number of bytes per key. Currently the only supported " @@ -1379,12 +1407,6 @@ DEFINE_bool(use_adaptive_mutex_lru, ROCKSDB_NAMESPACE::LRUCacheOptions().use_adaptive_mutex, "LRUCacheOptions.use_adaptive_mutex"); -DEFINE_uint32( - compress_format_version, - static_cast(ROCKSDB_NAMESPACE::CompressedSecondaryCacheOptions() - .compress_format_version), - "CompressedSecondaryCacheOptions.compress_format_version"); - DEFINE_uint64(manifest_preallocation_size, ROCKSDB_NAMESPACE::Options().manifest_preallocation_size, "Options.manifest_preallocation_size"); @@ -1426,6 +1448,17 @@ DEFINE_bool(block_align, ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align, "BlockBasedTableOptions.block_align"); +DEFINE_uint64( + super_block_alignment_size, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().super_block_alignment_size, + "BlockBasedTableOptions.super_block_alignment_size"); + +DEFINE_uint64( + super_block_alignment_space_overhead_ratio, + ROCKSDB_NAMESPACE::BlockBasedTableOptions() + .super_block_alignment_space_overhead_ratio, + "BlockBasedTableOptions.super_block_alignment_space_overhead_ratio"); + DEFINE_uint32( lowest_used_cache_tier, static_cast(ROCKSDB_NAMESPACE::Options().lowest_used_cache_tier), @@ -1478,13 +1511,63 @@ DEFINE_bool(paranoid_memory_checks, ROCKSDB_NAMESPACE::Options().paranoid_memory_checks, "Sets CF option paranoid_memory_checks."); +DEFINE_bool( + memtable_veirfy_per_key_checksum_on_seek, + ROCKSDB_NAMESPACE::Options().memtable_veirfy_per_key_checksum_on_seek, + "Sets CF option memtable_veirfy_per_key_checksum_on_seek."); + DEFINE_uint32(commit_bypass_memtable_one_in, 0, "If greater than zero, transaction option will set " "commit_bypass_memtable to per every N transactions on average."); +// Compaction on deletion trigger flags +DEFINE_bool(enable_compaction_on_deletion_trigger, false, + "Enable CompactOnDeletionCollectorFactory for stress testing " + "deletion-triggered compaction scenarios."); + +DEFINE_uint64(compaction_on_deletion_min_file_size, 32 * 1024, + "Minimum file size (in bytes) for deletion-triggered compaction. " + "Files smaller than this will not trigger compaction even if " + "deletion ratio is exceeded. Default: 32KB"); + +DEFINE_int32(compaction_on_deletion_trigger_count, 50, + "Number of deletions that triggers compaction when deletion " + "ratio is exceeded. Default: 50"); + +DEFINE_int32(compaction_on_deletion_window_size, 100, + "Size of the sliding window for tracking deletions. " + "Default: 100"); + +DEFINE_double(compaction_on_deletion_ratio, 0.5, + "Deletion ratio threshold for triggering compaction. " + "Default: 0.5 (50%)"); + DEFINE_bool( auto_refresh_iterator_with_snapshot, ROCKSDB_NAMESPACE::ReadOptions().auto_refresh_iterator_with_snapshot, "ReadOptions.auto_refresh_iterator_with_snapshot"); +DEFINE_uint32( + memtable_op_scan_flush_trigger, + ROCKSDB_NAMESPACE::ColumnFamilyOptions().memtable_op_scan_flush_trigger, + "Sets CF option memtable_op_scan_flush_trigger."); + +DEFINE_uint32( + memtable_avg_op_scan_flush_trigger, + ROCKSDB_NAMESPACE::ColumnFamilyOptions().memtable_avg_op_scan_flush_trigger, + "Sets CF option memtable_avg_op_scan_flush_trigger."); + +DEFINE_bool( + universal_reduce_file_locking, + ROCKSDB_NAMESPACE::ColumnFamilyOptions() + .compaction_options_universal.reduce_file_locking, + "Sets " + "ColumnFamilyOptions().compaciton_options_universal.reduce_file_locking."); + +DEFINE_bool(use_multiscan, false, + "If set, use the batched MultiScan API for scans."); + +DEFINE_bool(multiscan_use_async_io, false, + "If set, enable async_io for MultiScan operations."); + #endif // GFLAGS diff --git a/db_stress_tool/db_stress_listener.h b/db_stress_tool/db_stress_listener.h index 35c70b5a1036..fd28d5b4ced0 100644 --- a/db_stress_tool/db_stress_listener.h +++ b/db_stress_tool/db_stress_listener.h @@ -9,6 +9,7 @@ #include #include +#include "db_stress_tool/db_stress_compaction_service.h" #include "db_stress_tool/db_stress_shared_state.h" #include "file/filename.h" #include "file/writable_file_writer.h" @@ -21,7 +22,6 @@ #include "util/gflags_compat.h" #include "util/random.h" #include "utilities/fault_injection_fs.h" - DECLARE_int32(compact_files_one_in); extern std::shared_ptr fault_fs_guard; @@ -265,7 +265,7 @@ class DbStressListener : public EventListener { fault_fs_guard->DisableAllThreadLocalErrorInjection(); // TODO(hx235): only exempt the flush thread during error recovery instead // of all the flush threads from error injection - fault_fs_guard->SetIOActivtiesExcludedFromFaultInjection( + fault_fs_guard->SetIOActivitiesExcludedFromFaultInjection( {Env::IOActivity::kFlush}); } } @@ -275,7 +275,7 @@ class DbStressListener : public EventListener { RandomSleep(); if (FLAGS_error_recovery_with_no_fault_injection && fault_fs_guard) { fault_fs_guard->EnableAllThreadLocalErrorInjection(); - fault_fs_guard->SetIOActivtiesExcludedFromFaultInjection({}); + fault_fs_guard->SetIOActivitiesExcludedFromFaultInjection({}); } } @@ -310,6 +310,11 @@ class DbStressListener : public EventListener { } } } + // We can't do exact matching since remote workers use dynamic temp paths + if (file_dir.find(DbStressCompactionService::kTempOutputDirectoryPrefix) != + std::string::npos) { + return; + } assert(false); #else (void)file_dir; diff --git a/db_stress_tool/db_stress_shared_state.h b/db_stress_tool/db_stress_shared_state.h index 5d9fb34ac10c..b4546cd3bad2 100644 --- a/db_stress_tool/db_stress_shared_state.h +++ b/db_stress_tool/db_stress_shared_state.h @@ -51,6 +51,24 @@ DECLARE_bool(enable_compaction_filter); namespace ROCKSDB_NAMESPACE { class StressTest; +struct RemoteCompactionQueueItem { + std::string job_id; + CompactionServiceJobInfo job_info; + std::string serialized_input; + std::string output_directory; + bool canceled; + + RemoteCompactionQueueItem(const std::string& id, + const CompactionServiceJobInfo& info, + const std::string& input, + const std::string& output_dir, bool was_canceled) + : job_id(id), + job_info(info), + serialized_input(input), + output_directory(output_dir), + canceled(was_canceled) {} +}; + // State shared by all concurrent executions of the same benchmark. class SharedState { public: @@ -137,7 +155,7 @@ class SharedState { for (int i = 0; i < FLAGS_column_families; ++i) { key_locks_[i].reset(new port::Mutex[num_locks]); } - if (FLAGS_read_fault_one_in) { + if (FLAGS_read_fault_one_in || FLAGS_metadata_read_fault_one_in) { #ifdef NDEBUG // Unsupported in release mode because it relies on // `IGNORE_STATUS_IF_ERROR` to distinguish faults not expected to lead to @@ -276,6 +294,64 @@ class SharedState { return expected_state_manager_->GetPersistedSeqno(); } + void EnqueueRemoteCompaction(const std::string& job_id, + const CompactionServiceJobInfo& job_info, + const std::string& serialized_input, + const std::string& output_directory, + bool canceled) { + MutexLock l(&remote_compaction_queue_mu_); + remote_compaction_queue_.emplace(job_id, job_info, serialized_input, + output_directory, canceled); + } + + bool DequeueRemoteCompaction(std::string* job_id, + CompactionServiceJobInfo* job_info, + std::string* serialized_input, + std::string* output_directory, bool* canceled) { + assert(job_id); + assert(job_info); + assert(serialized_input); + assert(output_directory); + assert(canceled); + MutexLock l(&remote_compaction_queue_mu_); + if (!remote_compaction_queue_.empty()) { + const RemoteCompactionQueueItem& item = remote_compaction_queue_.front(); + *job_id = item.job_id; + *job_info = item.job_info; + *serialized_input = item.serialized_input; + *output_directory = item.output_directory; + *canceled = item.canceled; + remote_compaction_queue_.pop(); + return true; + } + return false; + } + + void AddRemoteCompactionResult(const std::string& job_id, + const Status& status, + const std::string& result) { + MutexLock l(&remote_compaction_result_map_mu_); + remote_compaction_result_map_.emplace( + job_id, std::pair{status, result}); + } + + std::optional GetRemoteCompactionResult(const std::string& job_id, + std::string* result) { + MutexLock l(&remote_compaction_result_map_mu_); + if (remote_compaction_result_map_.find(job_id) != + remote_compaction_result_map_.end()) { + const auto& pair = remote_compaction_result_map_.at(job_id); + *result = pair.second; + return pair.first; + } + return std::nullopt; + } + + void RemoveRemoteCompactionResult(const std::string& job_id) { + MutexLock l(&remote_compaction_result_map_mu_); + remote_compaction_result_map_.erase(job_id); + } + // Prepare a Put that will be started but not finish yet // This is useful for crash-recovery testing when the process may crash // before updating the corresponding expected value @@ -430,6 +506,15 @@ class SharedState { std::atomic verification_failure_; std::atomic should_stop_test_; + // Queue for the remote compaction. + port::Mutex remote_compaction_queue_mu_; + std::queue remote_compaction_queue_; + // Result Map for the remote compaciton. Key is the scheduled_job_id and value + // is serialized compaction_service_result + port::Mutex remote_compaction_result_map_mu_; + std::unordered_map> + remote_compaction_result_map_; + // Keys that should not be overwritten const std::unordered_set no_overwrite_ids_; diff --git a/db_stress_tool/db_stress_stat.cc b/db_stress_tool/db_stress_stat.cc deleted file mode 100644 index 6a7883a52ac7..000000000000 --- a/db_stress_tool/db_stress_stat.cc +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#ifdef GFLAGS - -#include "db_stress_tool/db_stress_stat.h" - -namespace ROCKSDB_NAMESPACE { - -std::shared_ptr dbstats; -std::shared_ptr dbstats_secondaries; - -} // namespace ROCKSDB_NAMESPACE - -#endif // GFLAGS diff --git a/db_stress_tool/db_stress_stat.h b/db_stress_tool/db_stress_stat.h index 5b38c6e2bb5d..e4a8a8fb5999 100644 --- a/db_stress_tool/db_stress_stat.h +++ b/db_stress_tool/db_stress_stat.h @@ -22,10 +22,6 @@ DECLARE_bool(progress_reports); namespace ROCKSDB_NAMESPACE { -// Database statistics -extern std::shared_ptr dbstats; -extern std::shared_ptr dbstats_secondaries; - class Stats { private: uint64_t start_; diff --git a/db_stress_tool/db_stress_table_properties_collector.h b/db_stress_tool/db_stress_table_properties_collector.h index 4723f6fc5d2f..b3f76e446436 100644 --- a/db_stress_tool/db_stress_table_properties_collector.h +++ b/db_stress_tool/db_stress_table_properties_collector.h @@ -26,25 +26,50 @@ class DbStressTablePropertiesCollector : public TablePropertiesCollector { Status AddUserKey(const Slice& /* key */, const Slice& /* value */, EntryType /*type*/, SequenceNumber /*seq*/, uint64_t /*file_size*/) override { + ++keys_added; + ++all_calls; return Status::OK(); } - Status Finish(UserCollectedProperties* /* properties */) override { + void BlockAdd(uint64_t /* block_uncomp_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { + ++blocks_added; + ++all_calls; + } + + Status Finish(UserCollectedProperties* properties) override { + ++all_calls; + (*properties)["db_stress_collector_property"] = + std::to_string(keys_added) + ";" + std::to_string(blocks_added) + ";" + + std::to_string(all_calls); return Status::OK(); } UserCollectedProperties GetReadableProperties() const override { - return UserCollectedProperties{}; + UserCollectedProperties props; + const_cast(this)->Finish(&props); + return props; } const char* Name() const override { return "DbStressTablePropertiesCollector"; } - bool NeedCompact() const override { return need_compact_; } + bool NeedCompact() const override { + ++all_calls; + return need_compact_; + } private: const bool need_compact_; + // These are tracked to detect race conditions that would arise from RocksDB + // invoking TablePropertiesCollector functions in an unsynchronized way, as + // TablePropertiesCollectors are allowed (encouraged) not to be thread safe. + size_t keys_added = 0; + size_t blocks_added = 0; + // Including race between BlockAdd and AddUserKey (etc.) + mutable size_t all_calls = 0; }; // A `DbStressTablePropertiesCollectorFactory` creates diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 8403ee3e9c4b..a57199e2d226 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -11,6 +11,7 @@ #include #include +#include "db_stress_tool/db_stress_compression_manager.h" #include "db_stress_tool/db_stress_listener.h" #include "rocksdb/io_status.h" #include "rocksdb/options.h" @@ -24,16 +25,19 @@ #include "db_stress_tool/db_stress_filters.h" #include "db_stress_tool/db_stress_table_properties_collector.h" #include "db_stress_tool/db_stress_wide_merge_operator.h" +#include "file/file_util.h" #include "options/options_parser.h" #include "rocksdb/convenience.h" #include "rocksdb/filter_policy.h" #include "rocksdb/secondary_cache.h" #include "rocksdb/sst_file_manager.h" +#include "rocksdb/table_properties.h" #include "rocksdb/types.h" #include "rocksdb/utilities/object_registry.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "test_util/testutil.h" #include "util/cast_util.h" +#include "util/simple_mixed_compressor.h" #include "utilities/backup/backup_engine_impl.h" #include "utilities/fault_injection_fs.h" #include "utilities/fault_injection_secondary_cache.h" @@ -70,25 +74,9 @@ StressTest::StressTest() new_column_family_name_(1), num_times_reopened_(0), db_preload_finished_(false), - secondary_db_(nullptr), is_db_stopped_(false) { if (FLAGS_destroy_db_initially) { - std::vector files; - db_stress_env->GetChildren(FLAGS_db, &files); - for (unsigned int i = 0; i < files.size(); i++) { - if (Slice(files[i]).starts_with("heap-")) { - db_stress_env->DeleteFile(FLAGS_db + "/" + files[i]); - } - } - - Options options; - options.env = db_stress_env; - // Remove files without preserving manfiest files - const Status s = !FLAGS_use_blob_db - ? DestroyDB(FLAGS_db, options) - : blob_db::DestroyBlobDB(FLAGS_db, options, - blob_db::BlobDBOptions()); - + const Status s = DbStressDestroyDb(FLAGS_db); if (!s.ok()) { fprintf(stderr, "Cannot destroy original db: %s\n", s.ToString().c_str()); exit(1); @@ -109,11 +97,10 @@ void StressTest::CleanUp() { if (db_) { db_->Close(); } - delete db_; + db_owner_.reset(); db_ = nullptr; - delete secondary_db_; - secondary_db_ = nullptr; + secondary_db_.reset(); } void StressTest::CleanUpColumnFamilies() { @@ -163,7 +150,6 @@ std::shared_ptr StressTest::NewCache(size_t capacity, } CompressedSecondaryCacheOptions opts; opts.capacity = FLAGS_compressed_secondary_cache_size; - opts.compress_format_version = FLAGS_compress_format_version; if (FLAGS_enable_do_not_compress_roles) { opts.do_not_compress_roles = {CacheEntryRoleSet::All()}; } @@ -191,10 +177,10 @@ std::shared_ptr StressTest::NewCache(size_t capacity, exit(1); } else if (EndsWith(cache_type, "hyper_clock_cache")) { size_t estimated_entry_charge; - if (cache_type == "fixed_hyper_clock_cache" || - cache_type == "hyper_clock_cache") { + if (cache_type == "fixed_hyper_clock_cache") { estimated_entry_charge = FLAGS_block_size; - } else if (cache_type == "auto_hyper_clock_cache") { + } else if (cache_type == "auto_hyper_clock_cache" || + cache_type == "hyper_clock_cache") { estimated_entry_charge = 0; } else { fprintf(stderr, "Cache type not supported."); @@ -346,7 +332,6 @@ bool StressTest::BuildOptionsTable() { "1", "2", }}, - {"max_sequential_skip_in_iterations", {"4", "8", "12"}}, {"block_based_table_factory", { keepRibbonFilterPolicyOnly ? "{filter_policy=ribbonfilter:2.35}" @@ -359,6 +344,13 @@ bool StressTest::BuildOptionsTable() { std::to_string(FLAGS_block_size + (FLAGS_seed & 0xFFFU)) + "}", }}, }; + if (FLAGS_use_multiscan == 0) { + // TODO: this can fail MultiScan when consecutive data blocks share the + // same user at boundary. MultiScan uses user key to locate the block to + // reach which can move the scan earlier than its current block. + options_tbl.emplace("max_sequential_skip_in_iterations", + std::vector{"4", "8", "12"}); + } if (FLAGS_compaction_style == kCompactionStyleUniversal && FLAGS_universal_max_read_amp > 0) { // level0_file_num_compaction_trigger needs to be at most max_read_amp @@ -425,8 +417,16 @@ bool StressTest::BuildOptionsTable() { options_tbl.emplace( "file_temperature_age_thresholds", std::vector{ + "{{temperature=kWarm;age=10}:{temperature=kCool;age=30}:{" + "temperature=kCold;age=100}:{" + "temperature=kIce;age=300}}", "{{temperature=kWarm;age=30}:{temperature=kCold;age=300}}", "{{temperature=kCold;age=100}}", "{}"}); + options_tbl.emplace( + "allow_trivial_copy_when_change_temperature", + std::vector{ + FLAGS_allow_trivial_copy_when_change_temperature ? "true" + : "false"}); } // NOTE: allow -1 to mean starting disabled but dynamically changing @@ -646,12 +646,20 @@ std::string StressTest::DebugString(const Slice& value, } void StressTest::PrintStatistics() { - if (dbstats) { - fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str()); + // Print statistics from the DB instance instead of global dbstats + if (db_) { + auto stats = db_->GetOptions().statistics; + if (stats) { + fprintf(stdout, "STATISTICS:\n%s\n", stats->ToString().c_str()); + } } - if (dbstats_secondaries) { - fprintf(stdout, "Secondary instances STATISTICS:\n%s\n", - dbstats_secondaries->ToString().c_str()); + // Print statistics from secondary DB instance if it exists + if (secondary_db_) { + auto stats = secondary_db_->GetOptions().statistics; + if (stats) { + fprintf(stdout, "Secondary instance STATISTICS:\n%s\n", + stats->ToString().c_str()); + } } } @@ -743,12 +751,11 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys, } if (s.ok()) { CleanUpColumnFamilies(); - delete db_; + db_owner_.reset(); db_ = nullptr; txn_db_ = nullptr; optimistic_txn_db_ = nullptr; - delete secondary_db_; - secondary_db_ = nullptr; + secondary_db_.reset(); db_preload_finished_.store(true); auto now = clock_->NowMicros(); @@ -786,6 +793,12 @@ Status StressTest::SetOptions(ThreadState* thread) { return db_->SetOptions(cfh, opts); } +Options StressTest::GetOptions(int cf_id) { + auto cfh = column_families_[cf_id]; + assert(cfh); + return db_->GetOptions(cfh); +} + void StressTest::ProcessRecoveredPreparedTxns(SharedState* shared) { assert(txn_db_); std::vector recovered_prepared_trans; @@ -838,11 +851,21 @@ Status StressTest::NewTxn(WriteOptions& write_opts, ThreadState* thread, FLAGS_use_only_the_last_commit_time_batch_for_recovery; txn_options.lock_timeout = 600000; // 10 min txn_options.deadlock_detect = true; - if (FLAGS_commit_bypass_memtable_one_in > 0) { + if (FLAGS_commit_bypass_memtable_one_in > 0 && + thread->rand.OneIn(FLAGS_commit_bypass_memtable_one_in)) { assert(FLAGS_txn_write_policy == 0); assert(FLAGS_user_timestamp_size == 0); - txn_options.commit_bypass_memtable = - thread->rand.OneIn(FLAGS_commit_bypass_memtable_one_in); + if (thread->rand.OneIn(2)) { + txn_options.commit_bypass_memtable = true; + } + if (thread->rand.OneIn(2)) { + txn_options.large_txn_commit_optimize_threshold = 1; + } + if (thread->rand.OneIn(2) || + (!txn_options.commit_bypass_memtable && + txn_options.large_txn_commit_optimize_threshold != 1)) { + txn_options.large_txn_commit_optimize_byte_threshold = 1; + } if (commit_bypass_memtable) { *commit_bypass_memtable = txn_options.commit_bypass_memtable; } @@ -859,6 +882,10 @@ Status StressTest::CommitTxn(Transaction& txn, ThreadState* thread) { return Status::InvalidArgument("CommitTxn when FLAGS_use_txn is not set"); } Status s = Status::OK(); + // We don't issue write to transaction's underlying WriteBatch in stress test + assert(txn.GetWriteBatch()->GetWriteBatch()->Count()); + assert(txn.GetWriteBatch()->GetWBWIOpCount() == + txn.GetWriteBatch()->GetWriteBatch()->Count()); if (FLAGS_use_optimistic_txn) { assert(optimistic_txn_db_); s = txn.Commit(); @@ -1240,6 +1267,11 @@ void StressTest::OperateDb(ThreadState* thread) { ProcessStatus(shared, "TestDisableManualCompaction", status); } + if (thread->rand.OneInOpt(FLAGS_abort_and_resume_compactions_one_in)) { + Status status = TestAbortAndResumeCompactions(thread); + ProcessStatus(shared, "TestAbortAndResumeCompactions", status); + } + if (thread->rand.OneInOpt(FLAGS_verify_checksum_one_in)) { ThreadStatusUtil::SetEnableTracking(FLAGS_enable_thread_tracking); ThreadStatusUtil::SetThreadOperation( @@ -1437,9 +1469,23 @@ void StressTest::OperateDb(ThreadState* thread) { } else if (prob_op < iterate_bound) { assert(delrange_bound <= prob_op); // OPERATION iterate - if (!FLAGS_skip_verifydb && - thread->rand.OneInOpt( - FLAGS_verify_iterator_with_expected_state_one_in)) { + if (FLAGS_use_multiscan) { + int num_seeks = static_cast( + std::min(static_cast(thread->rand.Uniform(64)), + static_cast(FLAGS_ops_per_thread - i - 1))); + // Generate 2x num_seeks random keys, as each scan has a start key + // and an upper bound + rand_keys = GenerateNKeys(thread, num_seeks * 2, i); + i += num_seeks - 1; + ThreadStatusUtil::SetEnableTracking(FLAGS_enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_DBITERATOR); + Status s; + s = TestMultiScan(thread, read_opts, rand_column_families, rand_keys); + ThreadStatusUtil::ResetThreadStatus(); + } else if (!FLAGS_skip_verifydb && + thread->rand.OneInOpt( + FLAGS_verify_iterator_with_expected_state_one_in)) { ThreadStatusUtil::SetEnableTracking(FLAGS_enable_thread_tracking); ThreadStatusUtil::SetThreadOperation( ThreadStatus::OperationType::OP_DBITERATOR); @@ -1617,6 +1663,184 @@ Status StressTest::TestIterateAttributeGroups( verify_func); } +Status StressTest::TestMultiScan(ThreadState* thread, + const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) { + size_t num_scans = rand_keys.size() / 2; + assert(!rand_column_families.empty()); + assert(!rand_keys.empty()); + + ThreadStatus::OperationType cur_op_type = + ThreadStatusUtil::GetThreadOperation(); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN); + ManagedSnapshot snapshot_guard(db_); + ThreadStatusUtil::SetThreadOperation(cur_op_type); + + ReadOptions ro = read_opts; + ro.snapshot = snapshot_guard.snapshot(); + + std::string read_ts_str; + Slice read_ts_slice; + MaybeUseOlderTimestampForRangeScan(thread, read_ts_str, read_ts_slice, ro); + + std::vector start_key_strs; + std::vector end_key_strs; + // TODO support reverse BytewiseComparator in the stress test + MultiScanArgs scan_opts(options_.comparator); + scan_opts.use_async_io = + FLAGS_multiscan_use_async_io && + CheckFSFeatureSupport(options_.env->GetFileSystem().get(), + FSSupportedOps::kAsyncIO); + start_key_strs.reserve(num_scans); + end_key_strs.reserve(num_scans); + + // Will be initialized before Seek() below. + Slice ub; + ro.iterate_upper_bound = &ub; + for (size_t i = 0; i < num_scans * 2; i += 2) { + assert(rand_keys[i] <= rand_keys[i + 1]); + start_key_strs.emplace_back(Key(rand_keys[i])); + end_key_strs.emplace_back(Key(rand_keys[i + 1])); + scan_opts.insert(Slice(start_key_strs.back()), Slice(end_key_strs.back())); + } + + std::string op_logs; + ro.pin_data = thread->rand.OneIn(2); + ro.background_purge_on_iterator_cleanup = thread->rand.OneIn(2); + + assert(options_.prefix_extractor.get() == nullptr); + + std::unique_ptr iter; + iter.reset(db_->NewIterator(ro, column_families_[rand_column_families[0]])); + iter->Prepare(scan_opts); + + constexpr size_t kOpLogsLimit = 50000; + + auto verify_func = [](Iterator* iterator) { + if (!VerifyWideColumns(iterator->value(), iterator->columns())) { + fprintf(stderr, + "Value and columns inconsistent for iterator: value: %s, " + "columns: %s\n", + iterator->value().ToString(/* hex */ true).c_str(), + WideColumnsToHex(iterator->columns()).c_str()); + return false; + } + return true; + }; + + for (const ScanOptions& scan_opt : scan_opts.GetScanRanges()) { + if (op_logs.size() > kOpLogsLimit) { + // Shouldn't take too much memory for the history log. Clear it. + op_logs = "(cleared...)\n"; + } + + // Set up an iterator, perform the same operations without bounds and with + // total order seek, and compare the results. This is to identify bugs + // related to bounds, prefix extractor, or reseeking. Sometimes we are + // comparing iterators with the same set-up, and it doesn't hurt to check + // them to be equal. + // + // This `ReadOptions` is for validation purposes. Ignore + // `FLAGS_rate_limit_user_ops` to avoid slowing any validation. + ReadOptions cmp_ro; + cmp_ro.timestamp = ro.timestamp; + cmp_ro.iter_start_ts = ro.iter_start_ts; + cmp_ro.snapshot = snapshot_guard.snapshot(); + cmp_ro.auto_refresh_iterator_with_snapshot = + ro.auto_refresh_iterator_with_snapshot; + cmp_ro.total_order_seek = true; + + ColumnFamilyHandle* const cmp_cfh = + GetControlCfh(thread, rand_column_families[0]); + assert(cmp_cfh); + + std::unique_ptr cmp_iter(db_->NewIterator(cmp_ro, cmp_cfh)); + + bool diverged = false; + + assert(scan_opt.range.start); + assert(scan_opt.range.limit); + Slice key = scan_opt.range.start.value(); + ub = scan_opt.range.limit.value(); + + LastIterateOp last_op; + iter->Seek(key); + cmp_iter->Seek(key); + last_op = kLastOpSeek; + op_logs += "S " + key.ToString(true) + " "; + + if (iter->Valid() && ro.allow_unprepared_value) { + op_logs += "*"; + + if (!iter->PrepareValue()) { + assert(!iter->Valid()); + assert(!iter->status().ok()); + } + } + + if (!iter->status().ok() && IsErrorInjectedAndRetryable(iter->status())) { + return iter->status(); + } else if (!cmp_iter->status().ok() && + IsErrorInjectedAndRetryable(cmp_iter->status())) { + return cmp_iter->status(); + } + + VerifyIterator(thread, cmp_cfh, ro, iter.get(), cmp_iter.get(), last_op, + key, op_logs, verify_func, &diverged); + + while (iter->Valid()) { + iter->Next(); + if (!diverged) { + assert(cmp_iter->Valid()); + cmp_iter->Next(); + } + op_logs += "N"; + + if (iter->Valid() && ro.allow_unprepared_value) { + op_logs += "*"; + + if (!iter->PrepareValue()) { + assert(!iter->Valid()); + assert(!iter->status().ok()); + } + } + + if (!iter->status().ok() && IsErrorInjectedAndRetryable(iter->status())) { + return iter->status(); + } else if (!cmp_iter->status().ok() && + IsErrorInjectedAndRetryable(cmp_iter->status())) { + return cmp_iter->status(); + } + + VerifyIterator(thread, cmp_cfh, ro, iter.get(), cmp_iter.get(), last_op, + key, op_logs, verify_func, &diverged); + + if (diverged) { + if (thread->shared->HasVerificationFailedYet()) { + const std::vector& scanoptions = + scan_opts.GetScanRanges(); + for (const auto& t : scanoptions) { + fprintf(stdout, "Multiscan options: %s to %s \n", + t.range.start.value().ToString(true).c_str(), + t.range.limit.value().ToString(true).c_str()); + } + } + break; + } + } + + thread->stats.AddIterations(1); + + op_logs += "; "; + if (diverged) { + break; + } + } + + return Status::OK(); +} + template Status StressTest::TestIterateImpl(ThreadState* thread, const ReadOptions& read_opts, @@ -2279,7 +2503,7 @@ Status StressTest::TestBackupRestore( from = "BackupEngine::PurgeOldBackups"; } } - DB* restored_db = nullptr; + std::unique_ptr restored_db; std::vector restored_cf_handles; // Not yet implemented: opening restored BlobDB or TransactionDB @@ -2367,8 +2591,7 @@ Status StressTest::TestBackupRestore( for (auto* cf_handle : restored_cf_handles) { restored_db->DestroyColumnFamilyHandle(cf_handle); } - delete restored_db; - restored_db = nullptr; + restored_db.reset(); } if (s.ok() && inplace_not_restore) { // Purge late if inplace open read-only @@ -2603,7 +2826,7 @@ Status StressTest::TestCheckpoint(ThreadState* thread, delete checkpoint; checkpoint = nullptr; std::vector cf_handles; - DB* checkpoint_db = nullptr; + std::unique_ptr checkpoint_db; if (s.ok()) { Options options(options_); options.best_efforts_recovery = false; @@ -2667,8 +2890,7 @@ Status StressTest::TestCheckpoint(ThreadState* thread, delete cfh; } cf_handles.clear(); - delete checkpoint_db; - checkpoint_db = nullptr; + checkpoint_db.reset(); } // Temporarily disable error injection for clean-up @@ -2824,8 +3046,9 @@ void StressTest::TestCompactFiles(ThreadState* thread, // TOOD (hx235): allow an exact list of tolerable failures under stress // test bool non_ok_status_allowed = - s.IsManualCompactionPaused() || IsErrorInjectedAndRetryable(s) || - s.IsAborted() || s.IsInvalidArgument() || s.IsNotSupported(); + s.IsManualCompactionPaused() || s.IsCompactionAborted() || + IsErrorInjectedAndRetryable(s) || s.IsAborted() || + s.IsInvalidArgument() || s.IsNotSupported(); if (!non_ok_status_allowed) { fprintf(stderr, "Unable to perform CompactFiles(): %s under specified " @@ -2918,6 +3141,20 @@ Status StressTest::TestDisableManualCompaction(ThreadState* thread) { return Status::OK(); } +Status StressTest::TestAbortAndResumeCompactions(ThreadState* thread) { + // Abort all running compactions and prevent new ones from starting + db_->AbortAllCompactions(); + // Sleep to allow other threads to attempt operations while aborted + // Uses same sleep pattern as TestPauseBackground and + // TestDisableManualCompaction + int pwr2_micros = + std::min(thread->rand.Uniform(25), thread->rand.Uniform(25)); + clock_->SleepForMicroseconds(1 << pwr2_micros); + // Resume compactions + db_->ResumeAllCompactions(); + return Status::OK(); +} + void StressTest::TestAcquireSnapshot(ThreadState* thread, int rand_column_family, const std::string& keystr, uint64_t i) { @@ -3093,7 +3330,7 @@ void StressTest::TestCompactRange(ThreadState* thread, int64_t rand_key, if (!status.ok()) { // TOOD (hx235): allow an exact list of tolerable failures under stress test bool non_ok_status_allowed = - status.IsManualCompactionPaused() || + status.IsManualCompactionPaused() || status.IsCompactionAborted() || IsErrorInjectedAndRetryable(status) || status.IsAborted() || status.IsInvalidArgument() || status.IsNotSupported(); if (!non_ok_status_allowed) { @@ -3296,8 +3533,9 @@ void StressTest::PrintEnv() const { fprintf(stdout, "Verification only : %s\n", FLAGS_verification_only ? "true" : "false"); - const char* memtablerep = ""; + const char* memtablerep; switch (FLAGS_rep_factory) { + default: case kSkipList: memtablerep = "skip_list"; break; @@ -3376,8 +3614,6 @@ void StressTest::PrintEnv() const { FLAGS_sync_fault_injection); fprintf(stdout, "Best efforts recovery : %d\n", static_cast(FLAGS_best_efforts_recovery)); - fprintf(stdout, "Fail if OPTIONS file error: %d\n", - static_cast(FLAGS_fail_if_options_file_error)); fprintf(stdout, "User timestamp size bytes : %d\n", static_cast(FLAGS_user_timestamp_size)); fprintf(stdout, "Persist user defined timestamps : %d\n", @@ -3398,7 +3634,28 @@ void StressTest::Open(SharedState* shared, bool reopen) { InitializeOptionsFromFlags(cache_, filter_policy_, options_); } InitializeOptionsGeneral(cache_, filter_policy_, sqfc_factory_, options_); - + DbStressCustomCompressionManager::Register(); + + if (!strcasecmp(FLAGS_compression_manager.c_str(), "custom")) { + options_.compression_manager = + std::make_shared(); + } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) { + options_.compression_manager = + std::make_shared(GetBuiltinV2CompressionManager()); + } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "randommixed")) { + options_.compression_manager = + std::make_shared( + GetBuiltinV2CompressionManager()); + } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "autoskip")) { + options_.compression_manager = + CreateAutoSkipCompressionManager(GetBuiltinV2CompressionManager()); + } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "none")) { + // Nothing to do using default compression manager + } else { + fprintf(stderr, "Unknown compression manager: %s\n", + FLAGS_compression_manager.c_str()); + exit(1); + } if (FLAGS_prefix_size == 0 && FLAGS_rep_factory == kHashSkipList) { fprintf(stderr, "prefeix_size cannot be zero if memtablerep == prefix_hash\n"); @@ -3410,6 +3667,40 @@ void StressTest::Open(SharedState* shared, bool reopen) { "memtablerep != prefix_hash\n"); } + // Remote Compaction + if (FLAGS_remote_compaction_worker_threads > 0) { + // TODO(jaykorean) Remove this after fix - remote worker shouldn't recover + // from WAL + if (!FLAGS_disable_wal) { + fprintf(stderr, + "WAL is not compatible with Remote Compaction in Stress Test\n"); + exit(1); + } + if ((options_.enable_blob_files || + options_.enable_blob_garbage_collection || + FLAGS_allow_setting_blob_options_dynamically)) { + fprintf(stderr, + "Integrated BlobDB is currently incompatible with Remote " + "Compaction\n"); + exit(1); + } + // Each DB open/reopen gets a fresh compaction service instance with a clean + // aborted_ state + auto compaction_service = std::make_shared( + shared, FLAGS_remote_compaction_failure_fall_back_to_local); + + options_.compaction_service = compaction_service; + } + + if (FLAGS_allow_resumption_one_in > 0) { + if (FLAGS_remote_compaction_worker_threads == 0) { + fprintf(stderr, + "allow_resumption or randomize_allow_resumption requires " + "remote_compaction_worker_threads > 0\n"); + exit(1); + } + } + if ((options_.enable_blob_files || options_.enable_blob_garbage_collection || FLAGS_allow_setting_blob_options_dynamically) && FLAGS_best_efforts_recovery) { @@ -3567,26 +3858,28 @@ void StressTest::Open(SharedState* shared, bool reopen) { // StackableDB-based BlobDB if (FLAGS_use_blob_db) { blob_db::BlobDBOptions blob_db_options; - blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size; - blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync; blob_db_options.blob_file_size = FLAGS_blob_db_file_size; blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc; - blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff; blob_db::BlobDB* blob_db = nullptr; s = blob_db::BlobDB::Open(options_, blob_db_options, FLAGS_db, cf_descriptors, &column_families_, &blob_db); if (s.ok()) { + db_owner_.reset(blob_db); db_ = blob_db; } } else { if (db_preload_finished_.load() && FLAGS_read_only) { s = DB::OpenForReadOnly(DBOptions(options_), FLAGS_db, - cf_descriptors, &column_families_, &db_); + cf_descriptors, &column_families_, + &db_owner_); } else { s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors, - &column_families_, &db_); + &column_families_, &db_owner_); + } + if (s.ok()) { + db_ = db_owner_.get(); } } @@ -3602,10 +3895,9 @@ void StressTest::Open(SharedState* shared, bool reopen) { s = db_->GetRootDB()->WaitForCompact(WaitForCompactOptions()); if (!s.ok()) { CleanUpColumnFamilies(); - delete db_; + db_owner_.reset(); db_ = nullptr; - delete secondary_db_; - secondary_db_ = nullptr; + secondary_db_.reset(); } } if (!s.ok()) { @@ -3662,6 +3954,7 @@ void StressTest::Open(SharedState* shared, bool reopen) { } assert(s.ok()); { + db_owner_.reset(optimistic_txn_db_); db_ = optimistic_txn_db_; db_aptr_.store(optimistic_txn_db_, std::memory_order_release); } @@ -3683,6 +3976,8 @@ void StressTest::Open(SharedState* shared, bool reopen) { static_cast(FLAGS_wp_snapshot_cache_bits); txn_db_options.wp_commit_cache_bits = static_cast(FLAGS_wp_commit_cache_bits); + txn_db_options.use_per_key_point_lock_mgr = + FLAGS_use_per_key_point_lock_mgr; PrepareTxnDbOptions(shared, txn_db_options); s = TransactionDB::Open(options_, txn_db_options, FLAGS_db, cf_descriptors, &column_families_, &txn_db_); @@ -3695,6 +3990,7 @@ void StressTest::Open(SharedState* shared, bool reopen) { // Do not swap the order of the following. { + db_owner_.reset(txn_db_); db_ = txn_db_; db_aptr_.store(txn_db_, std::memory_order_release); } @@ -3707,6 +4003,13 @@ void StressTest::Open(SharedState* shared, bool reopen) { assert(s.ok()); assert(column_families_.size() == static_cast(FLAGS_column_families)); + // Clear statistics reference from options_ to intentionally shorten the + // statistics object lifetime to be same as the db object (which is the + // common case in practice) and detect if RocksDB access the statistics + // beyond its lifetime. + if (FLAGS_statistics) { + options_.statistics.reset(); + } // Secondary instance does not support write-prepared/write-unprepared // transactions, thus just disable secondary instance if we use @@ -3726,6 +4029,7 @@ void StressTest::Open(SharedState* shared, bool reopen) { } else { DBWithTTL* db_with_ttl; s = DBWithTTL::Open(options_, FLAGS_db, &db_with_ttl, FLAGS_ttl); + db_owner_.reset(db_with_ttl); db_ = db_with_ttl; } @@ -3805,12 +4109,11 @@ void StressTest::Reopen(ThreadState* thread) { } assert((txn_db_ == nullptr && optimistic_txn_db_ == nullptr) || (db_ == txn_db_ || db_ == optimistic_txn_db_)); - delete db_; + db_owner_.reset(); db_ = nullptr; txn_db_ = nullptr; optimistic_txn_db_ = nullptr; - delete secondary_db_; - secondary_db_ = nullptr; + secondary_db_.reset(); num_times_reopened_++; auto now = clock_->NowMicros(); @@ -4023,6 +4326,9 @@ void InitializeOptionsFromFlags( block_based_options.data_block_index_type = static_cast( FLAGS_data_block_index_type); + block_based_options.index_block_search_type = + static_cast( + FLAGS_index_block_search_type); block_based_options.prepopulate_block_cache = static_cast( FLAGS_prepopulate_block_cache); @@ -4041,14 +4347,16 @@ void InitializeOptionsFromFlags( static_cast( FLAGS_index_shortening); block_based_options.block_align = FLAGS_block_align; + block_based_options.super_block_alignment_size = + fLU64::FLAGS_super_block_alignment_size; + block_based_options.super_block_alignment_space_overhead_ratio = + fLU64::FLAGS_super_block_alignment_space_overhead_ratio; options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); options.db_write_buffer_size = FLAGS_db_write_buffer_size; options.write_buffer_size = FLAGS_write_buffer_size; options.max_write_buffer_number = FLAGS_max_write_buffer_number; options.min_write_buffer_number_to_merge = FLAGS_min_write_buffer_number_to_merge; - options.max_write_buffer_number_to_maintain = - FLAGS_max_write_buffer_number_to_maintain; options.max_write_buffer_size_to_maintain = FLAGS_max_write_buffer_size_to_maintain; options.memtable_prefix_bloom_size_ratio = @@ -4071,6 +4379,17 @@ void InitializeOptionsFromFlags( ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleFIFO) { options.compaction_options_fifo.allow_compaction = FLAGS_fifo_allow_compaction; + if (FLAGS_fifo_compaction_max_data_files_size_mb > 0) { + options.compaction_options_fifo.max_data_files_size = + FLAGS_fifo_compaction_max_data_files_size_mb * 1024 * 1024; + // max_table_files_size is ignored when max_data_files_size is non-zero, + // but validation requires max_data_files_size >= max_table_files_size. + options.compaction_options_fifo.max_table_files_size = + std::min(options.compaction_options_fifo.max_table_files_size, + options.compaction_options_fifo.max_data_files_size); + } + options.compaction_options_fifo.use_kv_ratio_compaction = + FLAGS_fifo_compaction_use_kv_ratio_compaction; } options.compaction_pri = static_cast(FLAGS_compaction_pri); @@ -4083,7 +4402,9 @@ void InitializeOptionsFromFlags( } } options.max_open_files = FLAGS_open_files; - options.statistics = dbstats; + if (FLAGS_statistics) { + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + } options.env = db_stress_env; options.use_fsync = FLAGS_use_fsync; options.compaction_readahead_size = FLAGS_compaction_readahead_size; @@ -4125,6 +4446,7 @@ void InitializeOptionsFromFlags( options.compression_opts.checksum = true; } options.max_manifest_file_size = FLAGS_max_manifest_file_size; + options.max_manifest_space_amp_pct = FLAGS_max_manifest_space_amp_pct; options.max_subcompactions = static_cast(FLAGS_subcompactions); options.allow_concurrent_memtable_write = FLAGS_allow_concurrent_memtable_write; @@ -4165,6 +4487,8 @@ void InitializeOptionsFromFlags( FLAGS_memtable_protection_bytes_per_key; options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key; options.paranoid_memory_checks = FLAGS_paranoid_memory_checks; + options.memtable_veirfy_per_key_checksum_on_seek = + FLAGS_memtable_veirfy_per_key_checksum_on_seek; // Integrated BlobDB options.enable_blob_files = FLAGS_enable_blob_files; @@ -4220,10 +4544,14 @@ void InitializeOptionsFromFlags( StringToTemperature(FLAGS_default_temperature.c_str()); if (!FLAGS_file_temperature_age_thresholds.empty()) { + const std::string allowTrivialCopyBoolStr = + FLAGS_allow_trivial_copy_when_change_temperature ? "true" : "false"; Status s = GetColumnFamilyOptionsFromString( {}, options, "compaction_options_fifo={file_temperature_age_thresholds=" + - FLAGS_file_temperature_age_thresholds + "}", + FLAGS_file_temperature_age_thresholds + + ";allow_trivial_copy_when_change_temperature=" + + allowTrivialCopyBoolStr + "}", &options); if (!s.ok()) { fprintf(stderr, "While setting file_temperature_age_thresholds: %s\n", @@ -4257,7 +4585,6 @@ void InitializeOptionsFromFlags( options.best_efforts_recovery = FLAGS_best_efforts_recovery; options.paranoid_file_checks = FLAGS_paranoid_file_checks; - options.fail_if_options_file_error = FLAGS_fail_if_options_file_error; if (FLAGS_user_timestamp_size > 0) { CheckAndSetOptionsForUserTimestamp(options); @@ -4317,10 +4644,9 @@ void InitializeOptionsFromFlags( options.inplace_update_support = FLAGS_inplace_update_support; options.uncache_aggressiveness = FLAGS_uncache_aggressiveness; - // Remote Compaction - if (FLAGS_enable_remote_compaction) { - options.compaction_service = std::make_shared(); - } + options.memtable_op_scan_flush_trigger = FLAGS_memtable_op_scan_flush_trigger; + options.compaction_options_universal.reduce_file_locking = + FLAGS_universal_reduce_file_locking; } void InitializeOptionsGeneral( @@ -4331,8 +4657,8 @@ void InitializeOptionsGeneral( options.create_missing_column_families = true; options.create_if_missing = true; - if (!options.statistics) { - options.statistics = dbstats; + if (FLAGS_statistics) { + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); } if (options.env == Options().env) { @@ -4405,6 +4731,16 @@ void InitializeOptionsGeneral( if (sqfc_factory && !sqfc_factory->GetConfigs().IsEmptyNotFound()) { options.table_properties_collector_factories.emplace_back(sqfc_factory); } + + // Add CompactOnDeletionCollectorFactory if enabled + if (FLAGS_enable_compaction_on_deletion_trigger) { + options.table_properties_collector_factories.emplace_back( + ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory( + FLAGS_compaction_on_deletion_window_size, + FLAGS_compaction_on_deletion_trigger_count, + FLAGS_compaction_on_deletion_ratio, + FLAGS_compaction_on_deletion_min_file_size)); + } } } // namespace ROCKSDB_NAMESPACE diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h index bba5c1665f13..d97aadf9e60e 100644 --- a/db_stress_tool/db_stress_test_base.h +++ b/db_stress_tool/db_stress_test_base.h @@ -14,6 +14,7 @@ #include "db_stress_tool/db_stress_common.h" #include "db_stress_tool/db_stress_shared_state.h" #include "rocksdb/experimental.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { class SystemClock; @@ -25,6 +26,13 @@ using experimental::SstQueryFilterConfigsManager; class StressTest { public: + static bool IsErrorInjectedAndRetryable(const Status& error_s) { + assert(!error_s.ok()); + return error_s.getState() && + FaultInjectionTestFS::IsInjectedError(error_s) && + !status_to_io_status(Status(error_s)).GetDataLoss(); + } + StressTest(); virtual ~StressTest() {} @@ -53,6 +61,7 @@ class StressTest { Status s = db_->EnableAutoCompaction(column_families_); return s; } + Options GetOptions(int cf_id); void CleanUp(); protected: @@ -274,6 +283,10 @@ class StressTest { return Status::NotSupported(); } + Status TestMultiScan(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys); + // Enum used by VerifyIterator() to identify the mode to validate. enum LastIterateOp { kLastOpSeek, @@ -319,6 +332,8 @@ class StressTest { Status TestDisableManualCompaction(ThreadState* thread); + Status TestAbortAndResumeCompactions(ThreadState* thread); + void TestAcquireSnapshot(ThreadState* thread, int rand_column_family, const std::string& keystr, uint64_t i); @@ -345,13 +360,6 @@ class StressTest { return Status::NotSupported("TestCustomOperations() must be overridden"); } - bool IsErrorInjectedAndRetryable(const Status& error_s) const { - assert(!error_s.ok()); - return error_s.getState() && - FaultInjectionTestFS::IsInjectedError(error_s) && - !status_to_io_status(Status(error_s)).GetDataLoss(); - } - void ProcessStatus(SharedState* shared, std::string msg, const Status& s, bool ignore_injected_error = true) const; @@ -396,6 +404,7 @@ class StressTest { std::shared_ptr cache_; std::shared_ptr compressed_cache_; std::shared_ptr filter_policy_; + std::unique_ptr db_owner_; DB* db_; TransactionDB* txn_db_; OptimisticTransactionDB* optimistic_txn_db_; @@ -414,7 +423,7 @@ class StressTest { std::atomic db_preload_finished_; std::shared_ptr sqfc_factory_; - DB* secondary_db_; + std::unique_ptr secondary_db_; std::vector secondary_cfhs_; bool is_db_stopped_; }; diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc index ca43b699c8f9..15b52b827b14 100644 --- a/db_stress_tool/db_stress_tool.cc +++ b/db_stress_tool/db_stress_tool.cc @@ -53,12 +53,6 @@ int db_stress_tool(int argc, char** argv) { SetupSyncPointsToMockDirectIO(); } #endif - if (FLAGS_statistics) { - dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics(); - if (FLAGS_test_secondary) { - dbstats_secondaries = ROCKSDB_NAMESPACE::CreateDBStatistics(); - } - } compression_type_e = StringToCompressionType(FLAGS_compression_type.c_str()); bottommost_compression_type_e = StringToCompressionType(FLAGS_bottommost_compression_type.c_str()); @@ -100,10 +94,39 @@ int db_stress_tool(int argc, char** argv) { raw_env = fault_env_guard.get(); } - env_wrapper_guard = std::make_shared( - raw_env, std::make_shared(raw_env->GetFileSystem())); + auto db_stress_fs = + std::make_shared(raw_env->GetFileSystem()); + env_wrapper_guard = + std::make_shared(raw_env, db_stress_fs); db_stress_env = env_wrapper_guard.get(); + // Handle --destroy_db_and_exit early, before other option validation + if (FLAGS_destroy_db_and_exit) { + s = DbStressDestroyDb(FLAGS_db); + if (s.ok()) { + fprintf(stdout, "Successfully destroyed db at %s\n", FLAGS_db.c_str()); + return 0; + } else { + fprintf(stderr, "Failed to destroy db at %s: %s\n", FLAGS_db.c_str(), + s.ToString().c_str()); + return 1; + } + } + + // Handle --delete_dir_and_exit early, before other option validation + if (!FLAGS_delete_dir_and_exit.empty()) { + s = DestroyDir(raw_env, FLAGS_delete_dir_and_exit); + if (s.ok()) { + fprintf(stdout, "Successfully deleted directory %s\n", + FLAGS_delete_dir_and_exit.c_str()); + return 0; + } else { + fprintf(stderr, "Failed to delete directory %s: %s\n", + FLAGS_delete_dir_and_exit.c_str(), s.ToString().c_str()); + return 1; + } + } + FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); // The number of background threads should be at least as much the diff --git a/db_stress_tool/expected_value.h b/db_stress_tool/expected_value.h index 428c389cb66e..7aed38240f09 100644 --- a/db_stress_tool/expected_value.h +++ b/db_stress_tool/expected_value.h @@ -253,20 +253,20 @@ class PendingExpectedValue { class ExpectedValueHelper { public: // Return whether the key associated with `pre_read_expected_value` and - // `post_read_expected_value` is expected not to exist from begining till the + // `post_read_expected_value` is expected not to exist from beginning till the // end of the read // // The negation of `MustHaveNotExisted()` is "may have not existed". - // To assert some key must have existsed, please use `MustHaveExisted()` + // To assert some key must have existed, please use `MustHaveExisted()` static bool MustHaveNotExisted(ExpectedValue pre_read_expected_value, ExpectedValue post_read_expected_value); // Return whether the key associated with `pre_read_expected_value` and - // `post_read_expected_value` is expected to exist from begining till the end + // `post_read_expected_value` is expected to exist from beginning till the end // of the read. // // The negation of `MustHaveExisted()` is "may have existed". - // To assert some key must have not existsed, please use + // To assert some key must have not existed, please use // `MustHaveNotExisted()` static bool MustHaveExisted(ExpectedValue pre_read_expected_value, ExpectedValue post_read_expected_value); diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc index 616035a1b4fe..c9d3250a119a 100644 --- a/db_stress_tool/no_batched_ops_stress.cc +++ b/db_stress_tool/no_batched_ops_stress.cc @@ -233,6 +233,14 @@ class NonBatchedOpsStressTest : public StressTest { } Status s = secondary_db_->TryCatchUpWithPrimary(); +#ifndef NDEBUG + uint64_t manifest_num = + static_cast_with_check(secondary_db_.get()) + ->TEST_Current_Manifest_FileNo(); +#else + uint64_t manifest_num = 0; +#endif + if (!s.ok()) { VerificationAbort(shared, "Secondary failed to catch up to the primary"); @@ -267,9 +275,11 @@ class NonBatchedOpsStressTest : public StressTest { assert(!pre_read_expected_values.empty() && static_cast(i - start) < pre_read_expected_values.size()); - VerifyValueRange(static_cast(cf), i, options, shared, from_db, - /* msg_prefix */ "Secondary get verification", s, - pre_read_expected_values[i - start]); + VerifyValueRange( + static_cast(cf), i, options, shared, from_db, + /* msg_prefix */ "Secondary get verification, manifest: " + + std::to_string(manifest_num), + s, pre_read_expected_values[i - start]); } } } else if (method == VerificationMethod::kGetEntity) { @@ -1600,12 +1610,6 @@ class NonBatchedOpsStressTest : public StressTest { Slice ub_slice; ReadOptions ro_copy = read_opts; - // There is a narrow window in iterator auto refresh run where injected read - // errors are simply untraceable, ex. failure to delete file as a part of - // superversion cleanup callback invoked by the DBIter destructor. - bool ignore_injected_read_error_in_iter = - ro_copy.auto_refresh_iterator_with_snapshot; - // Randomly test with `iterate_upper_bound` and `prefix_same_as_start` // // Get the next prefix first and then see if we want to set it to be the @@ -1698,8 +1702,7 @@ class NonBatchedOpsStressTest : public StressTest { FaultInjectionIOType::kRead), fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount( FaultInjectionIOType::kMetadataRead)); - if (!ignore_injected_read_error_in_iter && - !SharedState::ignore_read_error && injected_error_count > 0 && + if (!SharedState::ignore_read_error && injected_error_count > 0 && s.ok()) { // Grab mutex so multiple thread don't try to print the // stack trace at the same time @@ -1852,7 +1855,17 @@ class NonBatchedOpsStressTest : public StressTest { } else if (FLAGS_use_merge) { if (!FLAGS_use_txn) { if (FLAGS_user_timestamp_size == 0) { - s = db_->Merge(write_opts, cfh, k, v); + if (FLAGS_ingest_wbwi_one_in && + thread->rand.OneIn(FLAGS_ingest_wbwi_one_in)) { + auto wbwi = std::make_shared( + options_.comparator, 0, /*overwrite_key=*/true); + s = wbwi->Merge(cfh, k, v); + if (s.ok()) { + s = db_->IngestWriteBatchWithIndex(write_opts, wbwi); + } + } else { + s = db_->Merge(write_opts, cfh, k, v); + } } else { s = db_->Merge(write_opts, cfh, k, write_ts, v); } @@ -1864,7 +1877,17 @@ class NonBatchedOpsStressTest : public StressTest { } else { if (!FLAGS_use_txn) { if (FLAGS_user_timestamp_size == 0) { - s = db_->Put(write_opts, cfh, k, v); + if (FLAGS_ingest_wbwi_one_in && + thread->rand.OneIn(FLAGS_ingest_wbwi_one_in)) { + auto wbwi = std::make_shared( + options_.comparator, 0, /*overwrite_key=*/true); + s = wbwi->Put(cfh, k, v); + if (s.ok()) { + s = db_->IngestWriteBatchWithIndex(write_opts, wbwi); + } + } else { + s = db_->Put(write_opts, cfh, k, v); + } } else { s = db_->Put(write_opts, cfh, k, write_ts, v); } @@ -1882,6 +1905,17 @@ class NonBatchedOpsStressTest : public StressTest { } while (!s.ok() && IsErrorInjectedAndRetryable(s) && initial_wal_write_may_succeed); + if ((s.IsDeadlock() || s.IsTimedOut()) && + (FLAGS_use_multiget || FLAGS_use_multi_get_entity)) { + // Deadlock or timeout is ok, when multi get is tested. Because multi get + // tests execute MaybeAddKeyToTxnForRYW function which writes to the + // same key space but does not acquire stress test level mutex. So it is + // possible RocksDB returns deadlock or timeout. Return OK() for these + // cases + pending_expected_value.Rollback(); + return Status::OK(); + } + if (!s.ok()) { pending_expected_value.Rollback(); if (IsErrorInjectedAndRetryable(s)) { @@ -1956,7 +1990,17 @@ class NonBatchedOpsStressTest : public StressTest { } if (!FLAGS_use_txn) { if (FLAGS_user_timestamp_size == 0) { - s = db_->Delete(write_opts, cfh, key); + if (FLAGS_ingest_wbwi_one_in && + thread->rand.OneIn(FLAGS_ingest_wbwi_one_in)) { + auto wbwi = std::make_shared( + options_.comparator, 0, /*overwrite_key=*/true); + s = wbwi->Delete(cfh, key); + if (s.ok()) { + s = db_->IngestWriteBatchWithIndex(write_opts, wbwi); + } + } else { + s = db_->Delete(write_opts, cfh, key); + } } else { s = db_->Delete(write_opts, cfh, key, write_ts); } @@ -2013,7 +2057,17 @@ class NonBatchedOpsStressTest : public StressTest { } if (!FLAGS_use_txn) { if (FLAGS_user_timestamp_size == 0) { - s = db_->SingleDelete(write_opts, cfh, key); + if (FLAGS_ingest_wbwi_one_in && + thread->rand.OneIn(FLAGS_ingest_wbwi_one_in)) { + auto wbwi = std::make_shared( + options_.comparator, 0, /*overwrite_key=*/true); + s = wbwi->SingleDelete(cfh, key); + if (s.ok()) { + s = db_->IngestWriteBatchWithIndex(write_opts, wbwi); + } + } else { + s = db_->SingleDelete(write_opts, cfh, key); + } } else { s = db_->SingleDelete(write_opts, cfh, key, write_ts); } @@ -3114,13 +3168,15 @@ class NonBatchedOpsStressTest : public StressTest { Status s; + ExpectedValue new_expected_value; + switch (op) { case Op::PutOrPutEntity: case Op::Merge: { ExpectedValue put_value; put_value.SyncPut(static_cast(thread->rand.Uniform( static_cast(ExpectedValue::GetValueBaseMask())))); - ryw_expected_values[k] = put_value; + new_expected_value = put_value; const uint32_t value_base = put_value.GetValueBase(); @@ -3144,7 +3200,7 @@ class NonBatchedOpsStressTest : public StressTest { case Op::Delete: { ExpectedValue delete_value; delete_value.SyncDelete(); - ryw_expected_values[k] = delete_value; + new_expected_value = delete_value; s = txn->Delete(cfh, k); break; @@ -3153,6 +3209,20 @@ class NonBatchedOpsStressTest : public StressTest { assert(false); } + // It is possible that multiple thread concurrently try to write to the + // same key, which could cause lock timeout or deadlock in the + // transactiondb layer, before transaction is rolled back. + // E.g. + // Timestamp 1: Transaction A: lock key M for write + // Timestamp 2: Transaction B: lock key N for write + // Timestamp 3: Transaction B: try to lock key M for write -> wait + // Timestamp 4: Transaction A: try to lock key N for write -> deadlock + if (s.IsTimedOut() || s.IsDeadlock()) { + return; + } + + ryw_expected_values[k] = new_expected_value; + if (!s.ok()) { fprintf(stderr, "Transaction write error in read-your-own-write test: %s\n", diff --git a/docs/_data/authors.yml b/docs/_data/authors.yml index 256f4c07ff65..0bc79ad80de6 100644 --- a/docs/_data/authors.yml +++ b/docs/_data/authors.yml @@ -1,3 +1,5 @@ +# Note: standardize on github user names here. fbid is optional and was used +# to use author's profile picture from Facebook icanadi: full_name: Igor Canadi fbid: 706165749 @@ -26,7 +28,7 @@ lgalanis: full_name: Leonidas Galanis fbid: 8649950 -sdong: +siying: full_name: Siying Dong fbid: 9805119 @@ -83,3 +85,19 @@ zjay: hx235: full_name: Hui Xiao fbid: 100037058588280 + +pdillinger: + full_name: Peter Dillinger + fbid: 513108 + +alanpaxton: + full_name: Alan Paxton + +akankshamahajan15: + full_name: Akanksha Mahajan + +anand1976: + full_name: Anand Ananthabhotla + +poojam23: + full_name: Pooja Malik diff --git a/docs/_posts/2014-05-14-lock.markdown b/docs/_posts/2014-05-14-lock.markdown index 12009cc88c11..66bf05dc4736 100644 --- a/docs/_posts/2014-05-14-lock.markdown +++ b/docs/_posts/2014-05-14-lock.markdown @@ -1,7 +1,7 @@ --- title: Reducing Lock Contention in RocksDB layout: post -author: sdong +author: siying category: blog redirect_from: - /blog/521/lock/ diff --git a/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown b/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown index 6a641f23353c..ed03b0273233 100644 --- a/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown +++ b/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown @@ -1,7 +1,7 @@ --- title: PlainTable — A New File Format layout: post -author: sdong +author: siying category: blog redirect_from: - /blog/599/plaintable-a-new-file-format/ diff --git a/docs/_posts/2015-02-27-write-batch-with-index.markdown b/docs/_posts/2015-02-27-write-batch-with-index.markdown index 7f9f77653655..770ee0581651 100644 --- a/docs/_posts/2015-02-27-write-batch-with-index.markdown +++ b/docs/_posts/2015-02-27-write-batch-with-index.markdown @@ -1,7 +1,7 @@ --- title: 'WriteBatchWithIndex: Utility for Implementing Read-Your-Own-Writes' layout: post -author: sdong +author: siying category: blog redirect_from: - /blog/1901/write-batch-with-index/ diff --git a/docs/_posts/2015-07-23-dynamic-level.markdown b/docs/_posts/2015-07-23-dynamic-level.markdown index 0ff3a0542f82..1bc41b2fb3a4 100644 --- a/docs/_posts/2015-07-23-dynamic-level.markdown +++ b/docs/_posts/2015-07-23-dynamic-level.markdown @@ -1,7 +1,7 @@ --- title: Dynamic Level Size for Level-Based Compaction layout: post -author: sdong +author: siying category: blog redirect_from: - /blog/2207/dynamic-level/ diff --git a/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown b/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown index b21b04fe3869..7e5eb03582d6 100644 --- a/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown +++ b/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown @@ -1,7 +1,7 @@ --- title: Analysis File Read Latency by Level layout: post -author: sdong +author: siying category: blog redirect_from: - /blog/2537/analysis-file-read-latency-by-level/ diff --git a/docs/_posts/2016-01-29-compaction_pri.markdown b/docs/_posts/2016-01-29-compaction_pri.markdown index ba9ee627c91d..955e0849c95f 100644 --- a/docs/_posts/2016-01-29-compaction_pri.markdown +++ b/docs/_posts/2016-01-29-compaction_pri.markdown @@ -1,7 +1,7 @@ --- title: Option of Compaction Priority layout: post -author: sdong +author: siying category: blog redirect_from: - /blog/2921/compaction_pri/ diff --git a/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown b/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown index 409015cc8c8c..927121bac173 100644 --- a/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown +++ b/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown @@ -1,7 +1,7 @@ --- title: RocksDB 4.2 Release! layout: post -author: sdong +author: siying category: blog redirect_from: - /blog/3017/rocksdb-4-2-release/ diff --git a/docs/_posts/2016-02-25-rocksdb-ama.markdown b/docs/_posts/2016-02-25-rocksdb-ama.markdown index 2ba04f39a18e..31792552fc29 100644 --- a/docs/_posts/2016-02-25-rocksdb-ama.markdown +++ b/docs/_posts/2016-02-25-rocksdb-ama.markdown @@ -1,7 +1,7 @@ --- title: RocksDB AMA layout: post -author: yhchiang +author: yhciang category: blog redirect_from: - /blog/3065/rocksdb-ama/ diff --git a/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown b/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown index 247768d307b4..b29a9bd3649f 100644 --- a/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown +++ b/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown @@ -1,7 +1,7 @@ --- title: RocksDB 4.5.1 Released! layout: post -author: sdong +author: siying category: blog redirect_from: - /blog/3179/rocksdb-4-5-1-released/ diff --git a/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown b/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown index 87c20eb47d43..11760cc82560 100644 --- a/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown +++ b/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown @@ -1,7 +1,7 @@ --- title: RocksDB 4.11.2 Released! layout: post -author: sdong +author: siying category: blog --- We abandoned release candidates 4.10.x and directly go to 4.11.2 from 4.9, to make sure the latest release is stable. In 4.11.2, we fixed several data corruption related bugs introduced in 4.9.0. diff --git a/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown b/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown index c6ce27d64db4..87fe0c050e0b 100644 --- a/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown +++ b/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown @@ -1,7 +1,7 @@ --- title: RocksDB 5.2.1 Released! layout: post -author: sdong +author: siying category: blog --- diff --git a/docs/_posts/2021-04-12-universal-improvements.markdown b/docs/_posts/2021-04-12-universal-improvements.markdown index fa4e9d463b23..f6bf64b2da8e 100644 --- a/docs/_posts/2021-04-12-universal-improvements.markdown +++ b/docs/_posts/2021-04-12-universal-improvements.markdown @@ -1,7 +1,7 @@ --- title: (Call For Contribution) Make Universal Compaction More Incremental layout: post -author: sdong +author: siying category: blog --- diff --git a/docs/_posts/2021-05-26-online-validation.markdown b/docs/_posts/2021-05-26-online-validation.markdown index 33e9dfc151ac..9314630b0705 100644 --- a/docs/_posts/2021-05-26-online-validation.markdown +++ b/docs/_posts/2021-05-26-online-validation.markdown @@ -1,7 +1,7 @@ --- title: Online Validation layout: post -author: sdong +author: siying category: blog --- To prevent or mitigate data corrution in RocksDB when some software or hardware issues happens, we keep adding online consistency checks and improving existing ones. diff --git a/docs/_posts/2025-09-24-unified-memory-tracking.markdown b/docs/_posts/2025-09-24-unified-memory-tracking.markdown new file mode 100644 index 000000000000..dba0ca488eb8 --- /dev/null +++ b/docs/_posts/2025-09-24-unified-memory-tracking.markdown @@ -0,0 +1,59 @@ +--- +title: Unified Memory Tracking +layout: post +author: hx235 +category: blog +--- + +## Context / Problem +Modern RocksDB deployments often run in environments with strict memory constraints—cloud VMs, containers, or hosts with hundreds of DB instances. Unpredictable memory usage can lead to out-of-memory (OOM) errors, degraded performance, or even service outages. +Historically, while the block cache was the main source of memory usage, other components—such as memtables, table readers, file metadata, and temporary buffers—could consume significant memory outside the block cache’s control. This made it difficult for users to set a single memory limit and guarantee resource usage stays within expectations. + +## Goal +The goal of recent memory tracking work in RocksDB is to enable users to cap the total memory usage of RocksDB instances under a single, configurable limit—the block cache capacity. This is achieved by: +- **Tracking and charging** all major memory consumers (memtables, table readers, file metadata, compression buffers, filter construction) to the block cache. +- **Evicting** data blocks or other memory when the total tracked usage exceeds the configured limit. +- **Providing a fixed memory footprint** for RocksDB, making it easier to run in resource-constrained environments and avoid OOMs. + +## Memtable Memory Charging +A major source of memory usage in RocksDB is the memtable. To ensure memtable memory is tracked and capped under a single limit, RocksDB provides the WriteBufferManager (WBM). When WBM is configured with a block cache, memtable memory usage is charged to the block cache. This helps prevent OOM errors and simplifies resource management. + +```cpp +std::shared_ptr cache = HyperClockCacheOptions(capacity).MakeSharedCache();; +DBOptions db_options; +db_options.write_buffer_manager = std::make_shared(.., cache); +``` + +## Other Memory Charging +Beyond memtables, RocksDB allows users to control memory charging for other internal roles using the cache_usage_options API. This provides fine-grained control over how memory is tracked for components like table readers, file metadata, compression dictionary buffers (`CompressionOptions::max_dict_buffer_bytes:`) and filter construction. + +```cpp +struct CacheEntryRoleOptions { + enum class Decision { + kEnabled, + kDisabled, + kFallback, + }; + Decision charged = Decision::kFallback; +}; +struct CacheUsageOptions { + CacheEntryRoleOptions options; + std::map options_overrides; +}; + +... +BlockBasedTableOptions table_options; +table_options.cache_usage_options.options.charged = CacheEntryRoleOptions::Decision::kFallback; +table_options.cache_usage_options.options_overrides[CacheEntryRole::kTableBuilder] = { + .charged = CacheEntryRoleOptions::Decision::kEnabled, +}; +``` + +Default (`Decision::kFallback`) behavior for each memory type: +- `CacheEntryRole::kCompressionDictionaryBuildingBuffer`: `kEnabled` +- `CacheEntryRole::kFilterConstruction`: `kDisabled` +- `CacheEntryRole::kBlockBasedTableReader`: `kDisabled` +- `CacheEntryRole::kFileMetadata`: `kDisabled` + +## Monitoring and Observability +RocksDB provides built-in statistics to help users monitor memory usage and cache behavior. The `DB::Properties::kBlockCacheEntryStats` exposes detailed statistics about block cache entries, including breakdowns by each `CacheEntryRole`. These statistics are essential for understanding memory consumption and tuning cache configuration. diff --git a/docs/_posts/2025-09-25-io-tagging.markdown b/docs/_posts/2025-09-25-io-tagging.markdown new file mode 100644 index 000000000000..14651d03f0e9 --- /dev/null +++ b/docs/_posts/2025-09-25-io-tagging.markdown @@ -0,0 +1,74 @@ +--- +title: IO Activity Tagging +layout: post +author: hx235 +category: blog +--- + +## Context + +RocksDB performs a variety of IO operations—user reads, background compactions, flushes, database opens, and verification tasks. Treating all these operations the same makes it difficult for file system implementers to optimize performance, prioritize latency-sensitive IOs, and diagnose bottlenecks. To solve that, RocksDB internally tags every IO operation with its activity type using the `IOActivity` enum. This automatic tagging provides precise context for each IO, enabling file systems to make smarter, context-aware decisions for scheduling, caching, and resource management. + +## How Internal IO Tagging Works +RocksDB automatically assigns an `IOActivity` tag to each IO operation. This tag is propagated through the storage stack and included in the IO options passed to the file system. + +```cpp +enum class IOActivity : uint8_t { + kFlush = 0, // IO for flush operations (background write) + kCompaction = 1, // IO for compaction (background read/write) + kDBOpen = 2, // IO during database open (read/write) + kGet = 3, // User Get() read + kMultiGet = 4, // User MultiGet() read + kDBIterator = 5, // User iterator read + kVerifyDBChecksum = 6, // Verification: DB checksum + kVerifyFileChecksums = 7, // Verification: file checksums + kGetEntity = 8, // Entity Get (e.g., wide-column) + kMultiGetEntity = 9, // Entity MultiGet + kGetFileChecksumsFromCurrentManifest = 10, // Manifest checksum reads + // 0x80–0xFE: Reserved for custom/internal use + kUnknown = 0xFF // Unknown/unspecified activity +}; +``` + +## Access IO Tag in File System +Custom file systems can access the IOActivity tag via the IO options structure provided by RocksDB. This allows them to optimize behavior based on the specific IO activity. + +```cpp +Status CustomFileSystem::Append(uint64_t offset, const Slice& data, const IOOptions& io_opts, ...) { + switch (io_opts.io_activity) { + case Env::IOActivity::kGet: + // Prioritize or cache user reads + break; + case Env::IOActivity::kCompaction: + // Throttle or deprioritize background compaction IO + break; + case Env::IOActivity::kDBOpen: + // Track or optimize DB open IO + break; + // ... handle other activities ... + default: + // Default handling + break; + } +} +``` +## IO Activity Statistics in RocksDB +RocksDB provides detailed histograms for IO activities, allowing you to analyze both the aggregate time spent (in microseconds) and the count of IOs for each activity type. +```cpp +// Read Histograms +FILE_READ_FLUSH_MICROS +FILE_READ_COMPACTION_MICROS +FILE_READ_DB_OPEN_MICROS +FILE_READ_GET_MICROS +FILE_READ_MULTIGET_MICROS +FILE_READ_DB_ITERATOR_MICROS +FILE_READ_VERIFY_DB_CHECKSUM_MICROS +FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS + +// Write Histograms +FILE_WRITE_FLUSH_MICROS +FILE_WRITE_COMPACTION_MICROS +FILE_WRITE_DB_OPEN_MICROS +``` + +Thanks to Maciej Szeszko and Andrew Chang from the RocksDB team for their contributions in expanding and maintaining the IOActivity enum. diff --git a/docs/_posts/2025-10-08-parallel-compression-revamp.markdown b/docs/_posts/2025-10-08-parallel-compression-revamp.markdown new file mode 100644 index 000000000000..42386e5c941a --- /dev/null +++ b/docs/_posts/2025-10-08-parallel-compression-revamp.markdown @@ -0,0 +1,89 @@ +--- +title: "Parallel Compression Revamp: Dramatically Reduced CPU Overhead" +layout: post +author: pdillinger +category: blog +--- + +The upcoming RocksDB 10.7 release includes a major revamp of parallel compression that **dramatically reduces the feature's CPU overhead by up to 65%** while maintaining or improving throughput for compression-heavy workloads. We expect this to broaden the set of workloads that could benefit from parallel compression, especially for **bulk SST generation and remote compaction use cases** that are less sensitive to CPU responsiveness. + +## Background + +Parallel compression in RocksDB (`CompressionOptions::parallel_threads > 1`) allows multiple threads to compress different blocks simultaneously during SST file generation, which can significantly improve compaction throughput for workloads where compression is a bottleneck. However, the original implementation had substantial CPU overhead that often outweighed the benefits, limiting its practical adoption. + +## What's New: A Complete Reimplementation + +The parallel compression framework has been completely rewritten from the ground up in [pull request #13910](https://github.com/facebook/rocksdb/pull/13910) to address the core inefficiencies: + +### Ring Buffer Architecture +Instead of separate compression and write queues with complex thread coordination, the new implementation uses a ring buffer of blocks-in-progress that enables efficient work distribution across threads. This bounds working memory while enabling high throughput with minimal cross-thread synchronization. + +![Ring Buffer Architecture](/static/images/parallel-compression/ring-buffer-architecture.svg) + +### Work-Stealing Design +Previously, the calling thread could only generate uncompressed blocks, dedicated compression threads could only compress, and a writer thread could only write the SST file to storage. Now, all threads can participate in compression work in a quasi-work-stealing manner, dramatically reducing the need for threads to block waiting for work. While only one thread (the calling thread or "emit thread") can generate uncompressed SST blocks in the new implementation, feeding compression work to other threads and itself, all other threads are compatible with writing compressed blocks to storage. + +### Auto-Scaling Thread Management +The ring buffer enables another key feature: auto-scaling of active threads based on ring buffer utilization. The framework intelligently wakes up idle worker threads only when there's sufficient work to justify the overhead, achieving near-maximum throughput while minimizing CPU waste from unnecessary thread wake-ups. + +### Lock-Free Synchronization +The entire framework is now lock-free (and wait-free as long as compatible work units are available for each thread), based primarily on atomic operations. To cleanly pack and leverage many data fields into a single atomic value, I've developed a new `BitFields` utility API. This is proving useful for cleaning up the HyperClockCache implementation as well, and will be the topic of a later blog post. + +Semaphores are used for lock-free management of idle threads (assuming a lock-free semaphore implementation, which is likely the case with `ROCKSDB_USE_STD_SEMAPHORES` but that is untrustworthy; see below). + +## Performance Improvements + +The results speak for themselves. Here's a comparison using `db_bench` fillseq benchmarks with various compression configurations: + +### ZSTD Compression (Default Level) +Note: +* "throughput" = how quickly a given CPU-bound flush or compaction can complete +* "CPU increase" = total CPU usage in amount of time that each core was used +* "PT" = parallel_threads setting. + +**Before:** +- PT=3: ~38% throughput increase for ~73% CPU increase +- PT=6: No throughput increase for ~70% CPU increase + +**After:** +- PT=3: ~58% throughput increase for ~25% CPU increase +- PT=6: ~58% throughput increase for ~28% CPU increase + +### High Compression Scenarios +For ZSTD compression level 8, the improvements are even more dramatic: + +**Before:** +- PT=4: 2.6x throughput increase for 139% CPU increase +- PT=8: 3.6x throughput increase for 135% CPU increase + +**After:** +- PT=4: 2.8x throughput increase for 114% CPU increase +- PT=8: 3.7x throughput increase for 116% CPU increase + +## Compression Algorithm Optimizations + +Alongside the parallel compression revamp, some optimizations have gone into the underlying compression implementations/integrations. Most notably, **LZ4HC received dramatic performance improvements** through better reuse of internal data structures between compression calls (detailed in [pull request #13805](https://github.com/facebook/rocksdb/pull/13805)). A small regression in LZ4 performance from that change was fixed in [pull request #14017](https://github.com/facebook/rocksdb/pull/14017). + +While **ZSTD remains the gold standard** for medium-to-high compression ratios in RocksDB, these LZ4HC optimizations make it an increasingly attractive option for read-heavy workloads where LZ4's faster decompression can provide overall performance benefits. + +## Production Ready + +With these efficiency improvements, parallel compression is now considered **production-ready**. The feature has been thoroughly tested in both unit tests and stress testing, including validation on high-load scenarios with hundreds of concurrent compression jobs and thousands of threads. + +Some notes on current limitations: +- Parallel compression is currently incompatible with `UserDefinedIndex` and with the deprecated `decouple_partitioned_filters=false` setting +- Maximum performance is available with `-DROCKSDB_USE_STD_SEMAPHORES` at compile time, though this is not currently recommended due to reported bugs in some implementations of C++20 semaphores + +## Configuration Recommendations + +The dramatically reduced CPU overhead means parallel compression is now viable for a broader range of workloads, particularly those using higher compression levels or compression-heavy scenarios like time-series data. However, simply enabling parallel compression could result in more *spiky* CPU loads for hosts serving live DB data. **Parallel compression might be most useful for bulk SST file generation and/or remote compaction workloads** because they are less sensitive to CPU responsiveness. In these scenarios there is little danger in setting `parallel_threads=8` even with the possibility of over-subscribing CPU cores, though the potentially safer "sweet spot" is typically around `parallel_threads=3`, depending on compression level, etc. + +## Limitations and Future + +Although this offers a great improvement in the implementation of an existing option, we recognize that this setup is suboptimal in a number of ways: +* There is no work sharing / thread pooling for these SST compression/writer threads among compactions in the same process, so not well able to fit the workload to available CPU cores and not able to use other SST file compression work to avoid a worker thread going to sleep. +* We are not (yet) using a framework that would allow micro-work sharing with things other than SST generation on a set of threads. That would be a good direction for effective sharing of CPU resources without spikes in usage, but might incur intolerable CPU overhead in managing work. With this "hand optimized" and specialized framework, we can at least evaluate such future endeavors against a perhaps ideal framework in terms of parallelizing with minimal overhead. + +## Try It Out + +Parallel compression revamp will be available in RocksDB 10.7. As always, we recommend testing in your specific environment to determine the optimal configuration for your workload. diff --git a/docs/_posts/2025-12-31-bit-fields-api.markdown b/docs/_posts/2025-12-31-bit-fields-api.markdown new file mode 100644 index 000000000000..40d1b60f5326 --- /dev/null +++ b/docs/_posts/2025-12-31-bit-fields-api.markdown @@ -0,0 +1,279 @@ +--- +title: "BitFields API: Type-Safe Bit Packing for Lock-Free Data Structures" +layout: post +author: pdillinger +category: blog +--- + +Modern concurrent data structures increasingly rely on [atomic operations](https://en.cppreference.com/w/cpp/atomic/atomic) to avoid the overhead of locking. A valuable but under-utilized technique for maximizing the effectiveness of atomic operations is [bit packing](https://en.wikipedia.org/wiki/Bit_field)---fitting multiple logical fields into a single atomic variable for algorithmic simplicity and efficiency. However, language support for bit packing does not guarantee dense packing, and manually managing bit manipulation quickly becomes error-prone, especially when dealing with complex state machines. + +To address this in RocksDB, we have developed a reusable **BitFields API**, a type-safe, zero-overhead abstraction for bit packing in C++. This works in conjunction with clean wrappers for `std::atomic` for powerful and relatively safe bit-packing of atomic data. For broader use, a [variant of the code](https://github.com/facebook/folly/pull/2549) has been proposed for adding to folly. + +## The Problem: Managing Packed Bit Fields + +Consider HyperClockCache, an essentially lock-free cache implementation in RocksDB, which was [refactored to use this BitFields API](https://github.com/facebook/rocksdb/pull/14154). It is a hash table built on *slots* that can each hold a cache entry and relevant metadata. For atomic simplicity and efficiency, all the essential metadata for each slot is packed into a single 64-bit value: +- The reference count and eviction metadata are together encoded into *acquire* and *release* counters, 30 bits each. +- The possible states of {*empty*, *under construction/destruction*, *occupied+visible*, and *occupied+invisible*} are encoded into three state bits (instead of two, for easier decoding and manipulation). +- A *hit* bit is used for secondary cache integration. + +Traditionally, you might write code like this: + +```cpp +// Old approach: manual bit manipulation +constexpr uint64_t kAcquireCounterShift = 0; +constexpr uint64_t kReleaseCounterShift = 30; +constexpr uint64_t kCounterMask = 0x3FFFFFFF; +constexpr uint64_t kHitBitShift = 60; +constexpr uint64_t kOccupiedShift = 61; +constexpr uint64_t kShareableShift = 62; +constexpr uint64_t kVisibleShift = 63; +constexpr uint64_t kStateShift = kOccupiedShift; + +std::atomic meta_; + +bool IsUnderConstruction(uint64_t meta) const { + return (meta & (uint64_t{1} << kOccupiedShift)) && !(meta & (uint64_t{1} << kShareableShift)); +} + +// Getting fields +uint64_t meta = meta_.load(std::memory_order_acquire); +if (IsUnderConstruction(meta)) { + // ... +} else if ((meta >> kVisibleShift) & 1) { + uint32_t refcount = + static_cast(((meta >> kAcquireCounterShift) - + (meta >> kReleaseCounterShift)) & kCounterMask); + // ... +} + + +// Setting fields + +// Set the hit bit (relaxed) +meta_.fetch_or(uint64_t{1} << kHitBitShift, std::memory_order_relaxed); + +// Set both counters to `new_count` (as in eviction processing) +uint64_t meta = meta_.load(std::memory_order_relaxed); +uint64_t new_meta = + (meta & ((uint64_t{1} << kHitBitShift) | (uint64_t{7} << kStateShift))) | + (new_count << kReleaseCounterShift) | + (new_count << kAcquireCounterShift); +bool success = meta_.compare_exchange_strong(meta, new_meta, + std::memory_order_acq_rel); + +// Increment acquire counter by initial_countdown +old_meta = meta_.fetch_add((uint64_t{1} << kAcquireCounterShift) * initial_countdown, + std::memory_order_acq_rel); +``` + +This approach has several problems: +1. **Error-prone**: Easy to get masks and shifts wrong +2. **Maintenance burden**: Changes to field sizes require updating multiple constants +3. **Abstraction challenges**: Even if writing a full set of well-tested getters and setters to hide all the details, details can leak in to do things like update multiple fields in one non-CAS (compare-and-swap) atomic operation. + +## New Solution: BitFields API + +The BitFields API provides a declarative, type-safe way to define bit-packed structures. Here's how the same example looks with BitFields: + +```cpp +// New approach: declarative bit fields. (Each field must reference the +// previous, so that the declaration machinery is simply stateless.) +struct SlotMeta : public BitFields { + using AcquireCounter = UnsignedBitField; + using ReleaseCounter = UnsignedBitField; + using HitFlag = BoolBitField; + using OccupiedFlag = BoolBitField; + using ShareableFlag = BoolBitField; + using VisibleFlag = BoolBitField; + + // Convenience helpers + bool IsUnderConstruction() const { + return Get() && !Get(); + } +}; + +BitFieldsAtomic meta_; + +// Getting fields +SlotMeta state = meta_.Load(); +if (state.IsUnderConstruction()) { + // ... +} else if (state.Get()) { + uint32_t refcount = state.Get() - + state.Get(); + // ... +} + +// Setting fields + +// Set the hit bit (relaxed) +meta_.ApplyRelaxed(SlotMeta::HitFlag::SetTransform()); + +// Set both counters to `new_count` (as in eviction processing) +SlotMeta meta = meta_.LoadRelaxed(); +SlotMeta new_meta = meta; +new_meta.Set(new_count); +new_meta.Set(new_count); +meta_.CasStrongRelaxed(meta, new_meta); + +// Increment acquire counter by initial_countdown +auto add_acquire = + AcquireCounter::PlusTransformPromiseNoOverflow(initial_countdown); +meta_.Apply(add_acquire, &old_meta); + +// Bonus: Atomic multi-field updates without compare-exchange +auto transform = AcquireCounter::PlusTransformPromiseNoOverflow(1) + + ReleaseCounter::PlusTransformPromiseNoOverflow(1); +meta_.Apply(transform); +``` + +## Key Features + +### Type Safety and Self-Documentation + +Each field has a specific type (`bool` for `BoolBitField`, appropriately-sized unsigned int for `UnsignedBitField`) and clear semantic meaning. The field definitions are self-documenting: you can immediately see how many bits each field occupies and in what order. + +### [Zero Overhead](https://en.cppreference.com/w/cpp/language/Zero-overhead_principle) + +Because of heavy use of templates and constexpr operations and the ability to satisfy multiple field reads or writes from a single atomic operation, we have seen no runtime overhead vs. hand-written bit manipulation, in RocksDB. In one case, we verified the assembly code was identical. + +[For folly's LifoSem](https://github.com/facebook/folly/pull/2550), there was one case where an optimization hack with detected overflow from one field to another couldn't be replicated as efficiently with the BitFields API because it would violate overflow checking. For that case I dove into the underlying representation to bypass the BitFields overflow check. + +### Atomic Operations with Transforms + +One of the most powerful features is the ability to combine multiple field updates into a single atomic operation using "transforms", if they are all either (a) some combination of addition and subtraction, (b) bitwise-and, or (c) bitwise-or. For example: + +```cpp +// Clear several but not all fields atomically +auto and_transform = Field1::AndTransform(0) + + Field2::ClearTransform() + + Field4::ClearTransform(); +atomic_bitfields.Apply(and_transform, &old_state, &new_state); +... +// Set more than one boolean field atomically +auto or_transform = Field2::SetTransform() + + Field4::SetTransform(); +atomic_bitfields.Apply(or_transform, &old_state, &new_state); +... +auto add_transform = Field1::PlusTransformPromiseNoOverflow(1) + + Field3::MinusTransformPromiseNoUnderflow(1); +atomic_bitfields.Apply(add_transform, &old_state, &new_state); +``` + +Each `Apply()` generates a single atomic operation (e.g., `fetch_add` or `fetch_or`) that updates all the specified fields, and optionally returns both the old and new values. This enables a number of hacks for atomic updates without CAS. + +### Overflow Protection + +The API includes built-in overflow detection in debug builds: + +```cpp +// An assertion will fail in debug builds if the counter overflows +auto transform = Counter::PlusTransformPromiseNoOverflow(value); +atomic.Apply(transform); +``` + +For fields at the top of the underlying representation (where overflow doesn't affect other fields), overflow is explicitly ignored. (A compile time error is generated if you try to use `PlusTransformPromiseNoOverflow` on a field at the top of the representation or `PlusTransformIgnoreOverflow` on a field not at the top of the representation.) + +```cpp +// For wraparound counters +auto transform = Counter::PlusTransformIgnoreOverflow(value); +``` + +This capability is used in a folly data structure called LifoSem, which [I have proposed to refactor](https://github.com/facebook/folly/pull/2550) to a proposed BitFields API variant for folly. + +### Compare-and-Swap (CAS) Support + +The atomic wrappers provide full CAS support for lock-free algorithms: + +```cpp +SlotMeta expected = current_state; +SlotMeta desired = expected.With(new_value).With(true); +if (meta_.CasStrong(expected, desired)) { + // Successfully updated + ... +} +``` + +### Atomic wrappers + +The BitFields API includes two atomic wrappers: `RelaxedBitFieldsAtomic` and `BitFieldsAtomic`. However, RocksDB also has versions of these wrappers for regular `std::atomic` variables that help with memory ordering discipline: `RelaxedAtomic` and `Atomic` in `util/atomic.h`. + +These wrappers help in a couple of ways: +* **Self-document intended memory order**: An atomic field generally has a single memory order that all or most operations should use, typically either `std::memory_order_relaxed` or `std::memory_order_acq_rel`. +* **More intentional memory orders and atomic operations**: The standard library's implicit conversions and default memory ordering (`memory_order_seq_cst`) make it easy to accidentally use sequential consistency with acquire/release ordering or even relaxed, which could hurt performance, and tend to hide where atomic operations are actually happening (e.g. implicit vs. explicit load). + +For example, instead of writing: +```cpp +std::atomic stat_counter; +stat_counter++; // Uses memory_order_seq_cst implicitly - maybe inefficient +``` + +You write: +```cpp +RelaxedAtomic stat_counter; +stat_counter.FetchAddRelaxed(1); // Explicitly relaxed - appropriate for a diagnostic counter +``` + +Or for data providing synchronization: +```cpp +Atomic refcount; +refcount.FetchAdd(1); // Standard acquire-release semantics for coordinating with other threads +``` + +These wrappers complement the BitFields atomic wrappers by providing the same ordering discipline for non-packed atomic variables throughout much of RocksDB, creating a more readable and less clunky approach to concurrent programming. Migrating remaining uses of `std::atomic` is an ongoing effort. + +## Real-World Usage in RocksDB + +The BitFields API was developed along with the revamped parallel compression in RocksDB, but with the intention to also clean up the HyperClockCache (HCC) implementation. With that migration complete, we can see the benefits. Specifically, **by packing more of the state machine into a single atomic value, the parallel algorithms became both simpler and more efficient.** Concurrent algorithms that could have blown up in their state space with elaborate interleavings between threads trying not to block each other, e.g. because of multi-step consensus on work assignments, were instead able to quickly and more easily make progress, e.g. with atomically clear work assignments. + +### Before: Manual Bit Manipulation + +The old HCC code was difficult to read and maintain. Many of the common read and update operations had manually written helper functions, but it was not practical to develop the full set of functions needed for rare cases. Consider this code that clears the "visible" flag on a slot when an entry is erased from subsequent lookups but might still be referenced: + +```cpp +// Old HCC code, without atomic wrappers +uint64_t old_meta = + h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit} + << ClockHandle::kStateShift), std::memory_order_acq_rel); +// Apply update to local copy +uint64_t new_meta = old_meta & ~(uint64_t{ClockHandle::kStateVisibleBit} + << ClockHandle::kStateShift); + +// New HCC code +SlotMeta old_meta, new_meta; +h->meta.Apply(SlotMeta::VisibleFlag::ClearTransform(), &old_meta, &new_meta); +``` + +Or this assertion that the acquire and release counters are different: + +```cpp +// Old HCC code +uint64_t old_meta = ...; +assert(((old_meta >> ClockHandle::kAcquireCounterShift) & + ClockHandle::kCounterMask) != + ((old_meta >> ClockHandle::kReleaseCounterShift) & + ClockHandle::kCounterMask)); + +// New HCC code without single-purpose helper functions +SlotMeta old_meta = ...; +assert(old_meta.Get() != + old_meta.Get()); + +// New HCC code, with single-purpose helper functions +SlotMeta old_meta = ...; +assert(old_meta.GetAcquireCounter() != old_meta.GetReleaseCounter()); +``` + +Some hand-written helper functions or using directives are still useful for brevity, but even without them all the bit manipulation details are hidden in the BitFields implementation. + +## Future Directions + +We hope the proposed folly version is accepted to make the BitFields API available for broader usage. Additionally, some quality-of-life improvements are likely possible, perhaps including easier declaration and usage syntax, hopefully without delving into boost-like macro hell. Better runtime and compile time checks might also be possible. + +## Conclusion + +The BitFields API demonstrates that zero-overhead abstractions can significantly improve code quality without sacrificing performance. By providing type safety, self-documentation, and convenience features around bit manipulation and atomic operations, it makes lock-free programming more accessible and maintainable. Bit-packed atomics are arguably essential for *slaying the complexity dragon* of efficient lock-free and low-lock algorithms, because they reduce explosion in algorithm states. + +For RocksDB specifically, the migration to BitFields has made the HyperClockCache implementation substantially easier to understand and modify, while maintaining the same high-performance characteristics. Combined with the recent [parallel compression revamp](/blog/2025/10/08/parallel-compression-revamp.html), these improvements showcase our ongoing commitment to writing clean, efficient, and maintainable code. + +The BitFields API is available in RocksDB's util/bit_fields.h and can be adapted for use in other projects requiring efficient, type-safe bit packing. For those building high-performance concurrent systems, it offers a compelling alternative to manual bit manipulation—proving that safe abstractions and peak performance are not mutually exclusive. diff --git a/docs/_posts/2026-02-17-cpu-bug.markdown b/docs/_posts/2026-02-17-cpu-bug.markdown new file mode 100644 index 000000000000..7147ca74dc6b --- /dev/null +++ b/docs/_posts/2026-02-17-cpu-bug.markdown @@ -0,0 +1,46 @@ +--- +title: "RocksDB development finds a CPU bug" +layout: post +author: pdillinger +category: blog +--- + +This is the story of how a RocksDB unit test I added four years ago, a mini-stress test you might call it, revealed [a novel hardware bug in a newer CPU](https://www.amd.com/en/resources/product-security/bulletin/amd-sb-7055.html). It was scary enough to be assigned a "high severity" CVE. + +## Background: Unique Identifiers +About four years ago, we [added unique identifiers to SST files](https://github.com/facebook/rocksdb/pull/9126) to give them stable identifiers across different filesystems for caching purposes. Part of the motivation here was to eliminate our dependence on the uniqueness and non-recycling of unique identifiers on files provided by the OS filesystem. (Some filesystems were only [guaranteeing uniqueness among existing files, not among all files even in recent history](https://github.com/facebook/rocksdb/issues/7405#issuecomment-694595587).) I would call this dependency problem the *great tension* between reusing existing solutions and code self-reliance. You don't want to duplicate others' work but you also don't want to be subject to their bugs or changing / misaligned requirements. Striking this balance can be tricky, but in this case it was clear to us that we didn't want to rely on all the possible filesystems providing quality unique identifiers. + +If you're comfortable with large random numbers (e.g. 128 bits), you probably agree that persisting random identifiers (or [quasi-random](https://github.com/pdillinger/unique_id/blob/main/README.md), which [I helped formalize in a paper](https://dl.acm.org/doi/10.1145/3584372.3588674), [also on arXiv](https://arxiv.org/abs/2304.07109)) with each file would be safer and more predictable than relying so crucially on a minor feature of OS filesystems. + +## High Quality Randomness +However, that assumes we have access to *high quality* random numbers (at least a good one or two to start from - see the paper). Because RocksDB intends to be cross-platform, we want to minimize platform-specific dependencies and prefer cross-platform dependencies. But that could easily land us back where we didn't want to be: susceptible to a bug or hiccup in one implementation of what we needed. + +Fortunately, the nature of random entropy allows *combining* sources so that your result is as good as your *best* input source, so even if one is bad, you only have a problem if they're all bad. And we had the advantages that (a) we only needed uniqueness, not security, which reduced the need for extra scrutiny and allowed us to use the quasi-random approach, and (b) the quasi-random approach minimized the amount of entropy needed, so the performance cost of acquiring each unit of entropy was almost inconsequential. Therefore, I combined these sources of entropy: + +* C++11's [std::random_device](https://en.cppreference.com/w/cpp/numeric/random/random_device.html) which is supposed to provide high quality but is allowed not to. +* A hash of various environment parameters including hostname, process id, thread id, and various macro and micro time readings. +* Platform-specific UUID generator (Linux and Windows only) + +## Trust But Verify +To verify the quality of each of these sources on an ongoing basis, [I added unit tests](https://github.com/facebook/rocksdb/pull/8708) that used many threads to create thousands of unique identifiers based on one of the above sources at a time and verified their uniqueness. For a high quality source, the probability of any duplicate 128-bit IDs among thousands is negligible, even if running these tests continuously for decades. + +## That's Weird +That was pretty much the story until some months ago the test based on `std::random_device` failed, once. It was quite suspicious because the number of unique IDs was not just one short of expectation, it was dozens or hundreds short. However, even that could be explained by a random CPU hiccup or bit flip in which we generated fewer IDs to begin with. (You might have noticed an increasing amount of RocksDB development effort and portion of CPU time going into checks that are logically redundant but exist to detect CPU miscalculations before the corruption propagates too far.) + +But then it failed again about a month later. No failures for four years, then two failures in two months. This smelled really bad. Digging into the details I noticed a crucial correlation: both of the failed test jobs had run on the same type of hardware, though in completely different data centers. + +From there I did the natural thing for an engineer: scale it up to try to reproduce the failure. And that was remarkably easy. By increasing the number of threads in the job to around the number of cores it would fail quickly and consistently on all systems using the same type of newer CPU, and pass on everything else. I tested some variants of this to establish some more details, including + +* `std::random_device` using "rdrand" and "/dev/urandom" sources were not affected, and +* libc++ (from clang) was not affected, only libstdc++ (from GCC) + +## Root Cause Analysis +From there Meta colleagues investigated the low-level details. They found the problem to be that the RDSEED instruction on this type of processor would return 0 and "success" much more often than would randomly be expected, but only on some cores and only under "complex micro-architectural conditions reproducible under memory-load," as a colleague describes it. A mitigating Linux kernel patch was developed to signal that RDSEED was unavailable on these processors, with the intention of rolling it out internally at Meta to avoid problems until a fix came from the OEM. [AMD quickly acknowledged the issue and announced planned mitigation](https://www.amd.com/en/resources/product-security/bulletin/amd-sb-7055.html), including a CPU microcode update. + +## With Apologies +Although I worked to keep the information confidential until the OEM publicly acknowledged the issue, the uncoordinated disclosure via the Linux mailing list was due to zealous remediation efforts that crossed multiple infrastructure teams at Meta. We regret the mistake and are working to improve controls on the processes that failed to coordinate with the OEM first. + +## Key Takeaways +* Test what you depend on. +* Have redundancies and/or sanity checks for what you depend on. +* Even CPUs can have bugs, usually flaky individual units but occasionally a bug affecting all units. diff --git a/docs/static/images/parallel-compression/ring-buffer-architecture.svg b/docs/static/images/parallel-compression/ring-buffer-architecture.svg new file mode 100644 index 000000000000..75ee489cf243 --- /dev/null +++ b/docs/static/images/parallel-compression/ring-buffer-architecture.svg @@ -0,0 +1,136 @@ + + + + + + + + + + + + + + + + + + + + + + + Ring Buffer Architecture (8 slots shown) for Parallel Compression + + + + + + + + + Slot 0 + Empty + + + + Slot 1 + Writing... + + + + Slot 2 + Compressed + + + + Slot 3 + Compressing... + + + + Slot 4 + Uncompressed + + + + Slot 5 + Adding block... + + + + Slot 6 + Empty + + + + Slot 7 + Empty + + + + + + NextToWrite=1 + + + + NextToCompress=4 + + + + NextToEmit=5 + + + + + Worker Thread 2 + Currently writing + Can also compress + + + + + + Worker Thread 1 + Currently compressing + Can also write + + + + + + Emit Thread + Generates uncompressed blocks + Can help with compression + + + + + + + + + + + + + + + + SST File + + + + + + + + Invariant: + NextToWrite ≤ NextToCompress ≤ NextToEmit (modulo ring buffer size) + + diff --git a/env/composite_env.cc b/env/composite_env.cc index 59434785ced5..a0a4d9edf66d 100644 --- a/env/composite_env.cc +++ b/env/composite_env.cc @@ -100,6 +100,10 @@ class CompositeRandomAccessFileWrapper : public RandomAccessFile { return target_->InvalidateCache(offset, length); } + Status GetFileSize(uint64_t* size) override { + return target_->GetFileSize(size); + } + private: std::unique_ptr target_; }; diff --git a/env/env.cc b/env/env.cc index 683771e72360..80d65cced3a5 100644 --- a/env/env.cc +++ b/env/env.cc @@ -9,6 +9,7 @@ #include "rocksdb/env.h" +#include #include #include "env/composite_env_wrapper.h" @@ -26,6 +27,7 @@ #include "rocksdb/utilities/object_registry.h" #include "rocksdb/utilities/options_type.h" #include "util/autovector.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { namespace { @@ -186,6 +188,10 @@ class LegacyRandomAccessFileWrapper : public FSRandomAccessFile { IOStatus InvalidateCache(size_t offset, size_t length) override { return status_to_io_status(target_->InvalidateCache(offset, length)); } + IOStatus GetFileSize(uint64_t* result) override { + auto status = target_->GetFileSize(result); + return status_to_io_status(std::move(status)); + } private: std::unique_ptr target_; @@ -732,6 +738,48 @@ std::string Env::PriorityToString(Env::Priority priority) { return "Invalid"; } +std::string Env::IOActivityToString(IOActivity activity) { + switch (activity) { + case Env::IOActivity::kFlush: + return "Flush"; + case Env::IOActivity::kCompaction: + return "Compaction"; + case Env::IOActivity::kDBOpen: + return "DBOpen"; + case Env::IOActivity::kGet: + return "Get"; + case Env::IOActivity::kMultiGet: + return "MultiGet"; + case Env::IOActivity::kDBIterator: + return "DBIterator"; + case Env::IOActivity::kVerifyDBChecksum: + return "VerifyDBChecksum"; + case Env::IOActivity::kVerifyFileChecksums: + return "VerifyFileChecksums"; + case Env::IOActivity::kGetEntity: + return "GetEntity"; + case Env::IOActivity::kMultiGetEntity: + return "MultiGetEntity"; + case Env::IOActivity::kGetFileChecksumsFromCurrentManifest: + return "GetFileChecksumsFromCurrentManifest"; + case Env::IOActivity::kUnknown: + return "Unknown"; + default: + int activityIndex = static_cast(activity); + if (activityIndex >= + static_cast(Env::IOActivity::kFirstCustomIOActivity) && + activityIndex <= + static_cast(Env::IOActivity::kLastCustomIOActivity)) { + std::stringstream ss; + ss << std::hex << std::uppercase << activityIndex; + return "CustomIOActivity" + ss.str(); + } + return "Invalid"; + }; + assert(false); + return "Invalid"; +} + uint64_t Env::GetThreadID() const { std::hash hasher; return hasher(std::this_thread::get_id()); diff --git a/env/env_encryption.cc b/env/env_encryption.cc index 16a3c32819f0..9565b9d9bc90 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -665,17 +665,52 @@ class EncryptedFileSystemImpl : public EncryptedFileSystem { std::unique_ptr* result, IODebugContext* dbg) override { result->reset(); - if (options.use_mmap_writes) { + if (options.use_mmap_reads || options.use_mmap_writes) { return IOStatus::InvalidArgument(); } + + size_t prefix_length = 0; + std::unique_ptr stream; + // Open file using underlying Env implementation std::unique_ptr underlying; - IOStatus status = + auto status = FileSystemWrapper::ReopenWritableFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } - return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); + + if (underlying->GetFileSize(options.io_options, dbg) != 0) { + // read the cipher stream from file for non-empty file + std::unique_ptr underlying_file_reader; + status = FileSystemWrapper::NewRandomAccessFile( + fname, options, &underlying_file_reader, dbg); + if (!status.ok()) { + return status; + } + + status = CreateRandomReadCipherStream( + fname, underlying_file_reader, options, &prefix_length, &stream, dbg); + + if (!status.ok()) { + return status; + } + } else { + // create cipher stream for new or empty file + status = CreateWritableCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + if (!status.ok()) { + return status; + } + } + + if (stream) { + result->reset(new EncryptedWritableFile( + std::move(underlying), std::move(stream), prefix_length)); + } else { + result->reset(underlying.release()); + } + return status; } IOStatus ReuseWritableFile(const std::string& fname, diff --git a/env/env_posix.cc b/env/env_posix.cc index 8b24a7a27888..86a7741f0f34 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -169,8 +169,9 @@ class PosixClock : public SystemClock { struct timespec ts; clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); return (static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec) / 1000; -#endif +#else return 0; +#endif } uint64_t CPUNanos() override { @@ -179,8 +180,9 @@ class PosixClock : public SystemClock { struct timespec ts; clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#endif +#else return 0; +#endif } void SleepForMicroseconds(int micros) override { usleep(micros); } diff --git a/env/env_test.cc b/env/env_test.cc index e89f48531dc1..4c0939ecffa4 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -41,6 +41,9 @@ #include "env/env_chroot.h" #include "env/env_encryption_ctr.h" #include "env/fs_readonly.h" +#if defined(ROCKSDB_IOURING_PRESENT) +#include "env/io_posix.h" +#endif #include "env/mock_env.h" #include "env/unique_id_gen.h" #include "logging/log_buffer.h" @@ -1655,42 +1658,6 @@ void GenerateFilesAndRequest(Env* env, const std::string& fname, } } -TEST_F(EnvPosixTest, MultiReadIOUringError) { - // In this test we don't do aligned read, so we can't do direct I/O. - EnvOptions soptions; - soptions.use_direct_reads = soptions.use_direct_writes = false; - std::string fname = test::PerThreadDBPath(env_, "testfile"); - - std::vector scratches; - std::vector reqs; - GenerateFilesAndRequest(env_, fname, &reqs, &scratches); - // Query the data - std::unique_ptr file; - ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); - - bool io_uring_wait_cqe_called = false; - SyncPoint::GetInstance()->SetCallBack( - "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", - [&](void* arg) { - if (!io_uring_wait_cqe_called) { - io_uring_wait_cqe_called = true; - ssize_t& ret = *(static_cast(arg)); - ret = 1; - } - }); - SyncPoint::GetInstance()->EnableProcessing(); - - Status s = file->MultiRead(reqs.data(), reqs.size()); - if (io_uring_wait_cqe_called) { - ASSERT_NOK(s); - } else { - s.PermitUncheckedError(); - } - - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); -} - TEST_F(EnvPosixTest, MultiReadIOUringError2) { // In this test we don't do aligned read, so we can't do direct I/O. EnvOptions soptions; @@ -1706,19 +1673,20 @@ TEST_F(EnvPosixTest, MultiReadIOUringError2) { bool io_uring_submit_and_wait_called = false; SyncPoint::GetInstance()->SetCallBack( - "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1", + "PosixRandomAccessFile::MultiRead:io_uring_sq_ready:return1", [&](void* arg) { io_uring_submit_and_wait_called = true; - ssize_t* ret = static_cast(arg); - (*ret)--; + unsigned* ret = static_cast(arg); + *ret = 1; }); SyncPoint::GetInstance()->SetCallBack( "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2", [&](void* arg) { struct io_uring* iu = static_cast(arg); struct io_uring_cqe* cqe; - assert(io_uring_wait_cqe(iu, &cqe) == 0); - io_uring_cqe_seen(iu, cqe); + // CQ should be empty after drain - peek should fail + int ret = io_uring_peek_cqe(iu, &cqe); + assert(-EAGAIN == ret); // No CQEs available }); SyncPoint::GetInstance()->EnableProcessing(); @@ -2540,7 +2508,7 @@ TEST_P(EnvFSTestWithParam, OptionsTest) { } } for (int i = 0; i < 2; ++i) { - DB* db; + std::unique_ptr db; Status s = DB::Open(opts, dbname, &db); ASSERT_OK(s); @@ -2558,7 +2526,7 @@ TEST_P(EnvFSTestWithParam, OptionsTest) { ASSERT_EQ("b", val); ASSERT_OK(db->Close()); - delete db; + db.reset(); ASSERT_OK(DestroyDB(dbname, opts)); dbname = dbname2_; @@ -3467,7 +3435,6 @@ class ReadAsyncRandomAccessFile : public FSRandomAccessFileOwnerWrapper { private: ReadAsyncFS& fs_; - std::unique_ptr file_; int counter = 0; }; @@ -3641,6 +3608,486 @@ TEST_F(TestAsyncRead, ReadAsync) { } } +// Test ReadAsync -> MultiRead -> Poll with real io_uring (not mock). +// This verifies that MultiRead doesn't interfere with async read buffers. +TEST_F(TestAsyncRead, InterleavingIOUringOperations) { +#if defined(ROCKSDB_IOURING_PRESENT) + // Use the real filesystem directly (not the mock ReadAsyncFS). + std::shared_ptr fs = env_->GetFileSystem(); + std::string fname = test::PerThreadDBPath(env_, "testfile_iouring"); + + constexpr size_t kSectorSize = 4096; + constexpr size_t kNumSectors = 8; + + // 1. Create & write to a file. + { + std::unique_ptr wfile; + ASSERT_OK( + fs->NewWritableFile(fname, FileOptions(), &wfile, nullptr /*dbg*/)); + + for (size_t i = 0; i < kNumSectors; ++i) { + auto data = NewAligned(kSectorSize * 8, static_cast(i + 1)); + Slice slice(data.get(), kSectorSize); + ASSERT_OK(wfile->Append(slice, IOOptions(), nullptr)); + } + ASSERT_OK(wfile->Close(IOOptions(), nullptr)); + } + + // 2. Test interleaved ReadAsync and MultiRead operations. + { + std::unique_ptr file; + ASSERT_OK(fs->NewRandomAccessFile(fname, FileOptions(), &file, nullptr)); + + IOOptions opts; + std::vector io_handles(kNumSectors); + std::vector async_reqs(kNumSectors); + std::vector> async_data; + std::vector vals; + IOHandleDeleter del_fn; + + // Initialize async read requests. + for (size_t i = 0; i < kNumSectors; i++) { + async_reqs[i].offset = i * kSectorSize; + async_reqs[i].len = kSectorSize; + async_data.emplace_back(NewAligned(kSectorSize, 0)); + async_reqs[i].scratch = async_data.back().get(); + vals.push_back(i); + } + + // Callback function for async reads. + std::function callback = + [&](FSReadRequest& req, void* cb_arg) { + assert(cb_arg != nullptr); + size_t i = *(reinterpret_cast(cb_arg)); + async_reqs[i].offset = req.offset; + async_reqs[i].result = req.result; + async_reqs[i].status = req.status; + }; + + // Submit asynchronous read requests. + for (size_t i = 0; i < kNumSectors; i++) { + void* cb_arg = static_cast(&(vals[i])); + IOStatus s = file->ReadAsync(async_reqs[i], opts, callback, cb_arg, + &(io_handles[i]), &del_fn, nullptr); + if (s.IsNotSupported()) { + // io_uring not supported on this system, skip the test. + fprintf(stderr, "Skipping test - io_uring not supported: %s\n", + s.ToString().c_str()); + for (size_t j = 0; j < i; j++) { + if (io_handles[j] != nullptr) { + del_fn(io_handles[j]); + } + } + return; + } + // For any other error, fail the test. + ASSERT_OK(s); + } + + // Do a MultiRead on same sectors while async reads are submitted. + std::vector multi_reqs(kNumSectors); + std::vector> multi_data; + for (size_t i = 0; i < kNumSectors; i++) { + multi_reqs[i].offset = i * kSectorSize; + multi_reqs[i].len = kSectorSize; + multi_data.emplace_back(NewAligned(kSectorSize, 0)); + multi_reqs[i].scratch = multi_data.back().get(); + } + ASSERT_OK(file->MultiRead(multi_reqs.data(), kNumSectors, opts, nullptr)); + + // Check the status of MultiRead requests (should all succeed). + for (size_t i = 0; i < kNumSectors; i++) { + auto buf = NewAligned(kSectorSize * 8, static_cast(i + 1)); + Slice expected_data(buf.get(), kSectorSize); + + ASSERT_EQ(multi_reqs[i].offset, i * kSectorSize); + ASSERT_OK(multi_reqs[i].status); + ASSERT_EQ(expected_data.ToString(), multi_reqs[i].result.ToString()); + } + + // Poll for the submitted async requests. + ASSERT_OK(fs->Poll(io_handles, kNumSectors)); + + // Check the status of async read requests (should all succeed). + for (size_t i = 0; i < kNumSectors; i++) { + auto buf = NewAligned(kSectorSize * 8, static_cast(i + 1)); + Slice expected_data(buf.get(), kSectorSize); + + ASSERT_EQ(async_reqs[i].offset, i * kSectorSize); + ASSERT_OK(async_reqs[i].status); + ASSERT_EQ(expected_data.ToString(), async_reqs[i].result.ToString()); + } + + // Delete io_handles. + for (size_t i = 0; i < io_handles.size(); i++) { + del_fn(io_handles[i]); + } + } +#else + fprintf(stderr, "Skipping test - ROCKSDB_IOURING_PRESENT not defined\n"); +#endif +} + +// Helper function to run AbortIO test with parameterized read requests. +// Each request is specified as {offset, length}. +// use_direct_io: if true, opens the file with O_DIRECT to bypass page cache. +// iterations: number of times to repeat the test (useful for race conditions). +void TestAbortIOWithRequests( + Env* env, size_t file_size, + const std::vector>& read_specs, + bool use_direct_io = false, int iterations = 1) { +#if defined(ROCKSDB_IOURING_PRESENT) + fprintf(stderr, + "TestAbortIOWithRequests: file_size=%zu, num_reads=%zu, " + "direct_io=%d, iterations=%d\n", + file_size, read_specs.size(), use_direct_io, iterations); + std::shared_ptr fs = env->GetFileSystem(); + std::string fname = test::PerThreadDBPath(env, "testfile_abortio"); + + // 1. Create test file once (content doesn't change between iterations) + { + std::unique_ptr wfile; + FileOptions file_opts; + file_opts.use_direct_writes = true; + ASSERT_OK(fs->NewWritableFile(fname, file_opts, &wfile, nullptr)); + + // Query the file's required buffer alignment (logical block size) + // instead of hardcoding 4096, to support devices with different + // sector sizes. + size_t sector_size = wfile->GetRequiredBufferAlignment(); + + // Round up to full sectors for direct IO writes + size_t num_sectors = (file_size + sector_size - 1) / sector_size; + for (size_t i = 0; i < num_sectors; ++i) { + auto data = NewAligned(sector_size, static_cast(i + 1)); + Slice slice(data.get(), sector_size); + ASSERT_OK(wfile->Append(slice, IOOptions(), nullptr)); + } + + // Truncate to exact file size if not aligned to sector boundary + if (file_size % sector_size != 0) { + ASSERT_OK(wfile->Truncate(file_size, IOOptions(), nullptr)); + } + + ASSERT_OK(wfile->Close(IOOptions(), nullptr)); + } + + for (int iter = 0; iter < iterations; iter++) { + // 2. Submit ReadAsync requests and immediately abort + { + FileOptions file_opts; + file_opts.use_direct_reads = use_direct_io; + std::unique_ptr file; + ASSERT_OK(fs->NewRandomAccessFile(fname, file_opts, &file, nullptr)); + + const size_t num_reads = read_specs.size(); + IOOptions opts; + std::vector io_handles(num_reads); + std::vector reqs(num_reads); + std::vector> data; + std::vector vals; + IOHandleDeleter del_fn; + std::atomic callbacks_invoked{0}; + + // Initialize read requests from specs + for (size_t i = 0; i < num_reads; i++) { + reqs[i].offset = read_specs[i].first; + reqs[i].len = read_specs[i].second; + data.emplace_back(NewAligned(reqs[i].len, 0)); + reqs[i].scratch = data.back().get(); + vals.push_back(i); + } + + // Callback + std::function callback = + [&](FSReadRequest& req, void* cb_arg) { + size_t i = *(reinterpret_cast(cb_arg)); + reqs[i].status = req.status; + callbacks_invoked++; + }; + + // Submit all ReadAsync requests + for (size_t i = 0; i < num_reads; i++) { + void* cb_arg = static_cast(&(vals[i])); + IOStatus s = file->ReadAsync(reqs[i], opts, callback, cb_arg, + &(io_handles[i]), &del_fn, nullptr); + if (s.IsNotSupported()) { + // io_uring not supported, clean up and skip + fprintf(stderr, + "WARNING: io_uring not supported, skipping test: %s\n", + s.ToString().c_str()); + for (size_t j = 0; j < i; j++) { + if (io_handles[j]) { + del_fn(io_handles[j]); + } + } + ASSERT_OK(fs->DeleteFile(fname, IOOptions(), nullptr)); + return; + } + ASSERT_OK(s); + } + + // Immediately call AbortIO - this should NOT hang + ASSERT_OK(fs->AbortIO(io_handles)); + + // Verify all handles are finished and all callbacks were invoked. + // Since all handles are passed to AbortIO, every handle is guaranteed + // to be finalized (either completed or cancelled). + for (size_t i = 0; i < num_reads; i++) { + Posix_IOHandle* h = static_cast(io_handles[i]); + ASSERT_TRUE(h->is_finished); + } + ASSERT_EQ(callbacks_invoked.load(), static_cast(num_reads)); + + // Clean up handles + for (size_t i = 0; i < num_reads; i++) { + if (io_handles[i]) { + del_fn(io_handles[i]); + } + } + } + } + + ASSERT_OK(fs->DeleteFile(fname, IOOptions(), nullptr)); + + fprintf(stderr, "TestAbortIOWithRequests: completed %d iterations\n", + iterations); +#else + fprintf(stderr, + "TestAbortIOWithRequests: SKIPPED (ROCKSDB_IOURING_PRESENT not " + "defined)\n"); + (void)env; + (void)file_size; + (void)read_specs; + (void)use_direct_io; + (void)iterations; +#endif +} + +// Test overlapping reads at aligned offsets (multiples of 4KB) +TEST_F(TestAsyncRead, AbortIOOverlappingAligned) { + // 4 reads of 16KB each, overlapping by 8KB, all at 4KB-aligned offsets + // Read 0: [0, 16KB), Read 1: [8KB, 24KB), Read 2: [16KB, 32KB), Read 3: + // [24KB, 40KB) + std::vector> specs = { + {0, 16384}, + {8192, 16384}, + {16384, 16384}, + {24576, 16384}, + }; + TestAbortIOWithRequests(env_, 64 * 1024, specs); +} + +// Test reads at unaligned offsets (not multiples of 4KB) +TEST_F(TestAsyncRead, AbortIOUnalignedOffsets) { + // Reads starting at non-4KB-aligned offsets + std::vector> specs = { + {1000, 8192}, // starts at 1000 (unaligned) + {5000, 12288}, // starts at 5000 (unaligned), spans multiple sectors + {15000, 8192}, // starts at 15000 (unaligned) + {25500, 16384}, // starts at 25500 (unaligned) + }; + TestAbortIOWithRequests(env_, 64 * 1024, specs); +} + +// Test mix of aligned and unaligned, various sizes +TEST_F(TestAsyncRead, AbortIOMixedOffsets) { + std::vector> specs = { + {0, 4096}, // aligned, 1 sector + {1500, 8192}, // unaligned, 2 sectors + {4096, 20480}, // aligned, 5 sectors + {7000, 4096}, // unaligned, spans 2 sectors + {16384, 32768}, // aligned, 8 sectors + {50000, 8192}, // unaligned + }; + TestAbortIOWithRequests(env_, 128 * 1024, specs); +} + +// Stress test with many concurrent handles +TEST_F(TestAsyncRead, AbortIOStress) { + std::vector> specs; + // 16 overlapping reads with mixed alignment + for (int i = 0; i < 16; i++) { + uint64_t offset = i * 4000; // Not aligned to 4KB + size_t len = 8192 + (i % 4) * 4096; // 8KB to 20KB + specs.emplace_back(offset, len); + } + TestAbortIOWithRequests(env_, 256 * 1024, specs); +} + +// Regression test for a fixed bug in AbortIO where out-of-order io_uring +// completions could cause an infinite hang. The bug occurred when completions +// for a different handle arrived while waiting for the current handle - the +// code would consume those completions but not mark the handle as finished, +// causing a hang when later iterating to that handle. +// +// Uses a large read (1MB) followed by a small read (4KB) with Direct I/O to +// maximize the chance of out-of-order completions. Runs 100 iterations to +// increase the likelihood of triggering the race condition. +TEST_F(TestAsyncRead, AbortIOReversedHandles) { + // Request 0: LARGE (1MB) at offset 0 + // Request 1: SMALL (4KB) at offset 1MB + std::vector> specs = { + {0, 1024 * 1024}, // 1MB read + {1024 * 1024, 4096}, // 4KB read at 1MB offset + }; + // 2MB file, Direct I/O enabled, 100 iterations + TestAbortIOWithRequests(env_, 2 * 1024 * 1024, specs, + /*use_direct_io=*/true, /*iterations=*/100); +} + +// Test for bug fix: AbortIO with partial handles should correctly handle +// completions for non-aborted handles. +// +// Previously, AbortIO would consume completions for non-aborted handles but +// not set is_finished (since it expected req_count==2 for all handles). +// This caused subsequent Poll calls to hang forever. +// +// The fix correctly detects handles not in the abort set and finalizes them +// immediately when their completion arrives (at req_count==1). +TEST_F(TestAsyncRead, AbortIOPartialHandlesBug) { +#if defined(ROCKSDB_IOURING_PRESENT) + std::shared_ptr fs = env_->GetFileSystem(); + std::string fname = test::PerThreadDBPath(env_, "testfile_abortio_partial"); + + constexpr size_t kSectorSize = 4096; + constexpr size_t kFileSize = 2 * 1024 * 1024; // 2MB + + // 1. Create test file with direct I/O + { + std::unique_ptr wfile; + FileOptions file_opts; + file_opts.use_direct_writes = true; + ASSERT_OK(fs->NewWritableFile(fname, file_opts, &wfile, nullptr)); + + size_t num_sectors = kFileSize / kSectorSize; + for (size_t i = 0; i < num_sectors; ++i) { + auto data = NewAligned(kSectorSize, static_cast(i + 1)); + Slice slice(data.get(), kSectorSize); + ASSERT_OK(wfile->Append(slice, IOOptions(), nullptr)); + } + ASSERT_OK(wfile->Close(IOOptions(), nullptr)); + } + + // 2. Submit 3 ReadAsync requests, abort only the first one, then Poll the + // rest + { + FileOptions file_opts; + file_opts.use_direct_reads = true; + std::unique_ptr file; + ASSERT_OK(fs->NewRandomAccessFile(fname, file_opts, &file, nullptr)); + + IOOptions opts; + constexpr size_t kNumReads = 3; + std::vector io_handles(kNumReads); + std::vector reqs(kNumReads); + std::vector> data; + std::vector vals; + IOHandleDeleter del_fn; + std::atomic callbacks_invoked{0}; + + // H0: 1MB read, H1: 4KB read, H2: 4KB read + std::vector> read_specs = { + {0, 1024 * 1024}, // H0: 1MB at offset 0 + {1024 * 1024, 4096}, // H1: 4KB at offset 1MB + {1024 * 1024 + 4096, 4096}, // H2: 4KB at offset 1MB+4KB + }; + + for (size_t i = 0; i < kNumReads; i++) { + reqs[i].offset = read_specs[i].first; + reqs[i].len = read_specs[i].second; + data.emplace_back(NewAligned(reqs[i].len, 0)); + reqs[i].scratch = data.back().get(); + vals.push_back(i); + } + + std::function callback = + [&](FSReadRequest& req, void* cb_arg) { + size_t i = *(reinterpret_cast(cb_arg)); + reqs[i].status = req.status; + callbacks_invoked++; + }; + + // Submit all ReadAsync requests + for (size_t i = 0; i < kNumReads; i++) { + void* cb_arg = static_cast(&(vals[i])); + IOStatus s = file->ReadAsync(reqs[i], opts, callback, cb_arg, + &(io_handles[i]), &del_fn, nullptr); + if (s.IsNotSupported()) { + // io_uring not supported, clean up and skip + for (size_t j = 0; j < i; j++) { + if (io_handles[j]) { + del_fn(io_handles[j]); + } + } + ASSERT_OK(fs->DeleteFile(fname, IOOptions(), nullptr)); + return; + } + ASSERT_OK(s); + } + + // Wait for reads to complete in io_uring (completions in queue but not + // consumed). 5 seconds should be plenty for direct I/O reads to complete. + std::this_thread::sleep_for(std::chrono::seconds(5)); + + // Abort ONLY H0 - this will consume all completions but should correctly + // finalize H1 and H2 (since they're not in the abort set). + std::vector abort_handles = {io_handles[0]}; + ASSERT_OK(fs->AbortIO(abort_handles)); + + // Verify H0 is finished (aborted) + Posix_IOHandle* h0 = static_cast(io_handles[0]); + ASSERT_TRUE(h0->is_finished); + ASSERT_EQ(h0->req_count, 2u); // original + cancel + + // Note: H1 and H2 may or may not be finished at this point. AbortIO + // finalizes non-aborted handles whose CQEs arrive while waiting for + // aborted handles, but CQE ordering is non-deterministic. If H0's + // completions arrived first, H1/H2's CQEs are still in the queue. + // Poll handles either case correctly. + + // Poll on H1, H2 - completes them if not already finalized by AbortIO + std::vector poll_handles = {io_handles[1], io_handles[2]}; + + // Use a watchdog to detect hang (regression test for the original bug + // where AbortIO consumed non-aborted CQEs without finalizing them) + std::atomic poll_completed{false}; + std::thread watchdog([&]() { + for (int i = 0; i < 500; i++) { // 5 seconds timeout + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + if (poll_completed) return; + } + // Bug regression: Poll hung + _exit(1); + }); + + fs->Poll(poll_handles, poll_handles.size()); + poll_completed = true; + watchdog.join(); + + // After Poll, H1 and H2 must be finished + Posix_IOHandle* h1 = static_cast(io_handles[1]); + Posix_IOHandle* h2 = static_cast(io_handles[2]); + ASSERT_TRUE(h1->is_finished); + ASSERT_TRUE(h2->is_finished); + + // Verify all callbacks were invoked + ASSERT_EQ(callbacks_invoked.load(), 3); + + // Clean up handles + for (size_t i = 0; i < kNumReads; i++) { + if (io_handles[i]) { + del_fn(io_handles[i]); + } + } + } + + ASSERT_OK(fs->DeleteFile(fname, IOOptions(), nullptr)); +#else + (void)env_; // Suppress unused variable warning +#endif +} + struct StaticDestructionTester { bool activated = false; ~StaticDestructionTester() { @@ -3657,6 +4104,60 @@ TEST(EnvTestMisc, StaticDestruction) { static_destruction_tester.activated = true; } +// Test GetFileSize API +class TestGetFileSize : public testing::Test { + public: + TestGetFileSize() { env_ = Env::Default(); } + Env* env_; +}; + +// Validate GetFileSize API returns the right value. +// Use the default implementation from env +TEST_F(TestGetFileSize, GetFileSize) { + EnvOptions soptions; + auto fs = env_->GetFileSystem(); + + std::string fname = test::PerThreadDBPath(env_, "getFileSizeTestfile"); + + // randomize file size + auto rnd = Random::GetTLSInstance(); + auto expectedFileSize = rnd->Uniform(256 * 1024) + 1; + auto content = rnd->RandomBinaryString(static_cast(expectedFileSize)); + + ASSERT_OK(CreateFile(fs.get(), fname, content, false)); + + std::unique_ptr file; + ASSERT_OK(fs->NewRandomAccessFile(fname, FileOptions(), &file, nullptr)); + + uint64_t fileSizeFromFileSystemAPI; + ASSERT_OK( + fs->GetFileSize(fname, IOOptions(), &fileSizeFromFileSystemAPI, nullptr)); + ASSERT_EQ(fileSizeFromFileSystemAPI, expectedFileSize); + + uint64_t fileSizeFromFsRandomAccessFileAPI; + ASSERT_OK(file->GetFileSize(&fileSizeFromFsRandomAccessFileAPI)); + + ASSERT_EQ(fileSizeFromFsRandomAccessFileAPI, expectedFileSize); +} + +class TestIOActivity : public testing::Test { + public: + TestIOActivity() {} +}; + +TEST_F(TestIOActivity, IOActivityToString) { + ASSERT_EQ(Env::IOActivityToString(Env::IOActivity::kMultiGet), "MultiGet"); + + ASSERT_EQ(Env::IOActivityToString(Env::IOActivity::kCustomIOActivity80), + "CustomIOActivity80"); + ASSERT_EQ(Env::IOActivityToString(Env::IOActivity::kCustomIOActivityA9), + "CustomIOActivityA9"); + ASSERT_EQ(Env::IOActivityToString(Env::IOActivity::kCustomIOActivityFE), + "CustomIOActivityFE"); + + ASSERT_EQ(Env::IOActivityToString(Env::IOActivity::kUnknown), "Unknown"); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/env/file_system_tracer.cc b/env/file_system_tracer.cc index dc44107b58c9..46fe4ce7491b 100644 --- a/env/file_system_tracer.cc +++ b/env/file_system_tracer.cc @@ -355,9 +355,11 @@ IOStatus FSRandomAccessFileTracingWrapper::ReadAsync( IOStatus s = target()->ReadAsync(req, opts, read_async_callback, read_async_cb_info, io_handle, del_fn, dbg); +#ifndef __clang_analyzer__ if (!s.ok()) { delete read_async_cb_info; } +#endif // __clang_analyzer__ return s; } diff --git a/env/fs_posix.cc b/env/fs_posix.cc index 82fb9fba337b..14b34ca6920d 100644 --- a/env/fs_posix.cc +++ b/env/fs_posix.cc @@ -243,7 +243,7 @@ class PosixFileSystem : public FileSystem { // Use mmap when virtual address-space is plentiful. uint64_t size; IOOptions opts; - s = GetFileSize(fname, opts, &size, nullptr); + s = GetFileSizeOnOpenedFile(fd, fname, &size); if (s.ok()) { void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0); if (base != MAP_FAILED) { @@ -270,7 +270,10 @@ class PosixFileSystem : public FileSystem { options #if defined(ROCKSDB_IOURING_PRESENT) , - !IsIOUringEnabled() ? nullptr : thread_local_io_urings_.get() + !IsIOUringEnabled() ? nullptr + : thread_local_async_read_io_urings_.get(), + !IsIOUringEnabled() ? nullptr + : thread_local_multi_read_io_urings_.get() #endif )); } @@ -322,8 +325,17 @@ class PosixFileSystem : public FileSystem { if (options.use_mmap_writes) { MaybeForceDisableMmap(fd); } + uint64_t initial_file_size = 0; + if (reopen) { + s = GetFileSizeOnOpenedFile(fd, fname, &initial_file_size); + if (!s.ok()) { + close(fd); + return s; + } + } if (options.use_mmap_writes && !forceMmapOff_) { - result->reset(new PosixMmapFile(fname, fd, page_size_, options)); + result->reset( + new PosixMmapFile(fname, fd, page_size_, options, initial_file_size)); } else if (options.use_direct_writes && !options.use_mmap_writes) { #ifdef OS_MACOSX if (fcntl(fd, F_NOCACHE, 1) == -1) { @@ -343,7 +355,7 @@ class PosixFileSystem : public FileSystem { #endif result->reset(new PosixWritableFile( fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd), - options)); + options, initial_file_size)); } else { // disable mmap writes EnvOptions no_mmap_writes_options = options; @@ -352,7 +364,7 @@ class PosixFileSystem : public FileSystem { new PosixWritableFile(fname, fd, GetLogicalBlockSizeForWriteIfNeeded( no_mmap_writes_options, fname, fd), - no_mmap_writes_options)); + no_mmap_writes_options, initial_file_size)); } return s; } @@ -418,7 +430,8 @@ class PosixFileSystem : public FileSystem { MaybeForceDisableMmap(fd); } if (options.use_mmap_writes && !forceMmapOff_) { - result->reset(new PosixMmapFile(fname, fd, page_size_, options)); + result->reset(new PosixMmapFile(fname, fd, page_size_, options, + /*initial_file_size=*/0)); } else if (options.use_direct_writes && !options.use_mmap_writes) { #ifdef OS_MACOSX if (fcntl(fd, F_NOCACHE, 1) == -1) { @@ -438,16 +451,16 @@ class PosixFileSystem : public FileSystem { #endif result->reset(new PosixWritableFile( fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd), - options)); + options, /*initial_file_size=*/0)); } else { // disable mmap writes FileOptions no_mmap_writes_options = options; no_mmap_writes_options.use_mmap_writes = false; - result->reset( - new PosixWritableFile(fname, fd, - GetLogicalBlockSizeForWriteIfNeeded( - no_mmap_writes_options, fname, fd), - no_mmap_writes_options)); + result->reset(new PosixWritableFile( + fname, fd, + GetLogicalBlockSizeForWriteIfNeeded(no_mmap_writes_options, fname, + fd), + no_mmap_writes_options, /*initial_file_size=*/0)); } return s; } @@ -499,7 +512,7 @@ class PosixFileSystem : public FileSystem { uint64_t size; if (status.ok()) { IOOptions opts; - status = GetFileSize(fname, opts, &size, nullptr); + status = GetFileSizeOnOpenedFile(fd, fname, &size); } void* base = nullptr; if (status.ok()) { @@ -661,7 +674,7 @@ class PosixFileSystem : public FileSystem { IOStatus GetFileSize(const std::string& fname, const IOOptions& /*opts*/, uint64_t* size, IODebugContext* /*dbg*/) override { - struct stat sbuf; + struct stat sbuf{}; if (stat(fname.c_str(), &sbuf) != 0) { *size = 0; return IOError("while stat a file for size", fname, errno); @@ -858,7 +871,6 @@ class PosixFileSystem : public FileSystem { IOOptions opts; return CreateDirIfMissing(*result, opts, nullptr); } - return IOStatus::OK(); } IOStatus GetFreeSpace(const std::string& fname, const IOOptions& /*opts*/, @@ -965,6 +977,22 @@ class PosixFileSystem : public FileSystem { private: bool forceMmapOff_ = false; // do we override Env options? + // This is a faster API comparing to the public method that uses stat to get + // file size. However this API only works on opened file. + IOStatus GetFileSizeOnOpenedFile(const int fd, const std::string& name, + uint64_t* size) { + struct stat sb{}; + *size = 0; + // Get file information using fstat + if (fstat(fd, &sb) == -1) { + return IOError( + "while fstat a file for size with fd " + std::to_string(fd), name, + errno); + } + *size = sb.st_size; + return IOStatus::OK(); + } + #ifdef OS_LINUX // Get the minimum "linux system limit" (i.e, the largest I/O size that the OS // can issue to block devices under a directory, also known as @@ -1062,8 +1090,9 @@ class PosixFileSystem : public FileSystem { #if defined(ROCKSDB_IOURING_PRESENT) // io_uring_queue_init. struct io_uring* iu = nullptr; - if (thread_local_io_urings_) { - iu = static_cast(thread_local_io_urings_->Get()); + if (thread_local_async_read_io_urings_) { + iu = static_cast( + thread_local_async_read_io_urings_->Get()); } // Init failed, platform doesn't support io_uring. @@ -1082,8 +1111,10 @@ class PosixFileSystem : public FileSystem { struct io_uring_cqe* cqe = nullptr; ssize_t ret = io_uring_wait_cqe(iu, &cqe); if (ret) { - // abort as it shouldn't be in indeterminate state and there is no - // good way currently to handle this error. + fprintf(stderr, "Poll: io_uring_wait_cqe failed: %ld", (long)ret); + if (ret == -EINTR || ret == -EAGAIN) { + continue; // Retry + } abort(); } @@ -1098,25 +1129,7 @@ class PosixFileSystem : public FileSystem { // Reset cqe data to catch any stray reuse of it static_cast(cqe)->user_data = 0xd5d5d5d5d5d5d5d5; - FSReadRequest req; - req.scratch = posix_handle->scratch; - req.offset = posix_handle->offset; - req.len = posix_handle->len; - - size_t finished_len = 0; - size_t bytes_read = 0; - bool read_again = false; - UpdateResult(cqe, "", req.len, posix_handle->iov.iov_len, - true /*async_read*/, posix_handle->use_direct_io, - posix_handle->alignment, finished_len, &req, bytes_read, - read_again); - posix_handle->is_finished = true; - io_uring_cqe_seen(iu, cqe); - posix_handle->cb(req, posix_handle->cb_arg); - - (void)finished_len; - (void)bytes_read; - (void)read_again; + FinalizeAsyncRead(iu, cqe, posix_handle); if (static_cast(io_handles[i]) == posix_handle) { break; @@ -1126,7 +1139,7 @@ class PosixFileSystem : public FileSystem { return IOStatus::OK(); #else (void)io_handles; - return IOStatus::NotSupported("Poll"); + return IOStatus::NotSupported("Poll not implemented"); #endif } @@ -1134,8 +1147,9 @@ class PosixFileSystem : public FileSystem { #if defined(ROCKSDB_IOURING_PRESENT) // io_uring_queue_init. struct io_uring* iu = nullptr; - if (thread_local_io_urings_) { - iu = static_cast(thread_local_io_urings_->Get()); + if (thread_local_async_read_io_urings_) { + iu = static_cast( + thread_local_async_read_io_urings_->Get()); } // Init failed, platform doesn't support io_uring. @@ -1156,6 +1170,11 @@ class PosixFileSystem : public FileSystem { return IOStatus::IOError(""); } + // Mark this handle as being aborted. This is used when processing + // completions to distinguish between aborted handles (expect 2 + // completions: original + cancel) and non-aborted handles (expect 1). + posix_handle->is_being_aborted = true; + // Prepare the cancel request. struct io_uring_sqe* sqe; sqe = io_uring_get_sqe(iu); @@ -1185,8 +1204,10 @@ class PosixFileSystem : public FileSystem { struct io_uring_cqe* cqe = nullptr; ssize_t ret = io_uring_wait_cqe(iu, &cqe); if (ret) { - // abort as it shouldn't be in indeterminate state and there is no - // good way currently to handle this error. + fprintf(stderr, "AbortIO: io_uring_wait_cqe failed: %ld", (long)ret); + if (ret == -EINTR || ret == -EAGAIN) { + continue; // Retry + } abort(); } assert(cqe != nullptr); @@ -1200,6 +1221,14 @@ class PosixFileSystem : public FileSystem { } posix_handle->req_count++; + if (!posix_handle->is_being_aborted) { + // This is a completion for a handle NOT being aborted. + // It only has 1 outstanding request (the original read), so we + // should finalize it now. + FinalizeAsyncRead(iu, cqe, posix_handle); + continue; + } + // Reset cqe data to catch any stray reuse of it static_cast(cqe)->user_data = 0xd5d5d5d5d5d5d5d5; io_uring_cqe_seen(iu, cqe); @@ -1213,16 +1242,23 @@ class PosixFileSystem : public FileSystem { // - And finally, if the request to cancel wasn't // found, the cancel request is completed with -ENOENT. // - // Every handle has to wait for 2 requests completion: original one and - // the cancel request which is tracked by PosixHandle::req_count. - if (posix_handle->req_count == 2 && - static_cast(io_handles[i]) == posix_handle) { + // Every handle being aborted has to wait for 2 requests completion: + // original one and the cancel request which is tracked by + // PosixHandle::req_count. + // Note: We must mark is_finished and invoke the callback for ANY handle + // that reaches req_count == 2, not just the one we're currently waiting + // for (io_handles[i]). Otherwise, if completions arrive out of order, + // we consume another handle's completions without marking it finished, + // causing an infinite hang when we later wait for that handle. + if (posix_handle->req_count == 2) { posix_handle->is_finished = true; FSReadRequest req; req.status = IOStatus::Aborted(); posix_handle->cb(req, posix_handle->cb_arg); - break; + if (static_cast(io_handles[i]) == posix_handle) { + break; + } } } } @@ -1238,16 +1274,18 @@ class PosixFileSystem : public FileSystem { void SupportedOps(int64_t& supported_ops) override { supported_ops = 0; #if defined(ROCKSDB_IOURING_PRESENT) - if (IsIOUringEnabled()) { + if (IsIOUringEnabled() && thread_local_async_read_io_urings_) { // Underlying FS supports async_io supported_ops |= (1 << FSSupportedOps::kAsyncIO); } #endif + supported_ops |= (1 << FSSupportedOps::kFSPrefetch); } #if defined(ROCKSDB_IOURING_PRESENT) // io_uring instance - std::unique_ptr thread_local_io_urings_; + std::unique_ptr thread_local_async_read_io_urings_; + std::unique_ptr thread_local_multi_read_io_urings_; #endif size_t page_size_; @@ -1302,12 +1340,12 @@ PosixFileSystem::PosixFileSystem() page_size_(getpagesize()), allow_non_owner_access_(true) { #if defined(ROCKSDB_IOURING_PRESENT) - // Test whether IOUring is supported, and if it does, create a managing - // object for thread local point so that in the future thread-local - // io_uring can be created. + // Test whether IOUring is supported with the same flags that ReadAsync and + // MultiRead will use at runtime. struct io_uring* new_io_uring = CreateIOUring(); if (new_io_uring != nullptr) { - thread_local_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring)); + thread_local_async_read_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring)); + thread_local_multi_read_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring)); delete new_io_uring; } #endif diff --git a/env/io_posix.cc b/env/io_posix.cc index 231e88daef39..a04e469cb91e 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -589,7 +589,8 @@ PosixRandomAccessFile::PosixRandomAccessFile( const EnvOptions& options #if defined(ROCKSDB_IOURING_PRESENT) , - ThreadLocalPtr* thread_local_io_urings + ThreadLocalPtr* thread_local_async_read_io_urings, + ThreadLocalPtr* thread_local_multi_read_io_urings #endif ) : filename_(fname), @@ -598,7 +599,8 @@ PosixRandomAccessFile::PosixRandomAccessFile( logical_sector_size_(logical_block_size) #if defined(ROCKSDB_IOURING_PRESENT) , - thread_local_io_urings_(thread_local_io_urings) + thread_local_async_read_io_urings_(thread_local_async_read_io_urings), + thread_local_multi_read_io_urings_(thread_local_multi_read_io_urings) #endif { assert(!options.use_direct_reads || !options.use_mmap_reads); @@ -607,6 +609,17 @@ PosixRandomAccessFile::PosixRandomAccessFile( PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); } +IOStatus PosixRandomAccessFile::GetFileSize(uint64_t* result) { + struct stat sbuf{}; + if (fstat(fd_, &sbuf) != 0) { + *result = 0; + return IOError("While fstat with fd " + std::to_string(fd_), filename_, + errno); + } + *result = sbuf.st_size; + return IOStatus::OK(); +} + IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n, const IOOptions& /*opts*/, Slice* result, char* scratch, @@ -648,6 +661,83 @@ IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n, return s; } +// MultiRead: Perform multiple concurrent read requests using io_uring. +// +// OVERVIEW: +// This function batches multiple read requests and submits them concurrently +// to io_uring for improved I/O performance. It operates synchronously from the +// caller's perspective (blocks until all reads complete) but uses io_uring's +// async capabilities internally for parallel I/O execution. +// +// IO_URING LIFECYCLE: +// 1. Preparation Phase: +// - Allocate SQEs (Submission Queue Entries) for read requests +// - Limited by: min(pending_work, io_uring_sq_space_left(), kIoUringDepth - +// inflight) +// - Uses io_uring_sq_space_left() to query available SQ slots +// - Each SQE is tracked in wrap_cache for completion matching +// +// 2. Submission Phase: +// - Loop: while io_uring_sq_ready() > 0 (SQEs pending submission) +// - Call io_uring_submit_and_wait() to submit SQEs and wait for CQEs +// - Handles retryable errors (EINTR, EAGAIN) by continuing +// - Breaks on terminal errors (logs error, sets err variable) +// +// 3. Completion Phase: +// - Non-blocking CQE reaping via io_uring_for_each_cqe() +// - Matches CQEs to requests using user_data pointer +// - Processes results: updates bytes read, handles partial reads +// - Removes completed requests from wrap_cache +// +// 4. Loop Iteration: +// - Repeats until: all requests submitted AND all completions reaped +// - Termination condition: (num_reqs == reqs_off) && +// resubmit_rq_list.empty() && wrap_cache.empty() +// +// ERROR HANDLING STRATEGY: +// - Retryable submission errors (-EINTR, -EAGAIN): Retry submission +// - Memory pressure (-ENOMEM): Mark memory_pressure_on_submission, attempt +// recovery +// - Terminal submission errors: Break, enter teardown path +// - Retryable CQE errors (-EINTR, -EAGAIN): Add to resubmit_rq_list for retry +// - Terminal CQE errors: Set ios to IOError, continue processing other CQEs +// - Teardown path: If SQEs remain unsubmitted after error, reap submitted CQEs, +// destroy io_uring instance, return error +// +// PARTIAL READ HANDLING: +// - Short reads (bytes_read < requested): Request added to resubmit_rq_list +// - finished_len tracks cumulative bytes read across resubmissions +// - iov.iov_base/iov_len adjusted on each resubmission attempt +// - UpdateResult() determines if read should be retried based on: +// * Direct I/O alignment requirements +// * EOF detection +// * Error conditions +// +// RESUBMISSION LOGIC: +// - resubmit_rq_list: Requests needing retry (short reads, EINTR/EAGAIN errors) +// - Prioritized in SQE allocation loop: resubmits before new requests +// - List cleared after SQE preparation +// - Requests remain in wrap_cache across resubmissions until fully complete +// +// CONCURRENCY CONTROL: +// - wrap_cache.size(): Tracks total inflight requests (SQ + CQ) +// - io_uring_sq_ready(): Queries SQEs prepared but not yet submitted +// - io_uring_sq_space_left(): Queries available SQ slots +// - Max concurrency: kIoUringDepth (256) +// +// ACCOUNTING CORRECTNESS: +// - Uses io_uring native APIs (io_uring_sq_ready, io_uring_sq_space_left) +// instead of manual counters for robustness +// - wrap_cache is the authoritative source for inflight request tracking +// - Re-query io_uring_sq_ready() after submission loop to detect +// unsubmitted SQEs (indicates submission errors) +// +// THREAD SAFETY: +// - Uses thread-local io_uring instance (thread_local_multi_read_io_urings_) +// - IORING_SETUP_SINGLE_ISSUER: Only one thread submits to this ring +// - IORING_SETUP_DEFER_TASKRUN: Task work runs in submitting thread +// - No cross-thread coordination required +// IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs, const IOOptions& options, IODebugContext* dbg) { @@ -661,12 +751,13 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs, #if defined(ROCKSDB_IOURING_PRESENT) struct io_uring* iu = nullptr; - if (thread_local_io_urings_) { - iu = static_cast(thread_local_io_urings_->Get()); + if (thread_local_multi_read_io_urings_) { + iu = static_cast( + thread_local_multi_read_io_urings_->Get()); if (iu == nullptr) { iu = CreateIOUring(); if (iu != nullptr) { - thread_local_io_urings_->Reset(iu); + thread_local_multi_read_io_urings_->Reset(iu); } } } @@ -677,8 +768,6 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs, return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg); } - IOStatus ios = IOStatus::OK(); - struct WrappedReadRequest { FSReadRequest* req; struct iovec iov; @@ -687,118 +776,199 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs, }; autovector req_wraps; - autovector incomplete_rq_list; + autovector resubmit_rq_list; std::unordered_set wrap_cache; for (size_t i = 0; i < num_reqs; i++) { req_wraps.emplace_back(&reqs[i]); } + IOStatus ios = IOStatus::OK(); size_t reqs_off = 0; - while (num_reqs > reqs_off || !incomplete_rq_list.empty()) { - size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size(); - - // If requests exceed depth, split it into batches - if (this_reqs > kIoUringDepth) { - this_reqs = kIoUringDepth; - } - - assert(incomplete_rq_list.size() <= this_reqs); - for (size_t i = 0; i < this_reqs; i++) { - WrappedReadRequest* rep_to_submit; - if (i < incomplete_rq_list.size()) { - rep_to_submit = incomplete_rq_list[i]; + while ((num_reqs > reqs_off) || !resubmit_rq_list.empty() || + !wrap_cache.empty()) { + assert(resubmit_rq_list.size() + wrap_cache.size() <= kIoUringDepth); + // Total number of requests that still need to be submitted, includes: + // + // 1) requests NOT yet submitted (num_reqs - reqs_off) + // 2) requests on resubmission list (resubmit_rq_list) + // + // capped by min of the # of remaining entries in IO ring submission queue + // and the max IO ring depth less the inflight requests. + size_t new_sqe_reqs_count = std::min({ + num_reqs - reqs_off + resubmit_rq_list.size(), + static_cast(io_uring_sq_space_left(iu)), + kIoUringDepth - wrap_cache.size() // queue depth less inflight requests + }); + for (size_t i = 0; i < new_sqe_reqs_count; i++) { + WrappedReadRequest* req; + if (i < resubmit_rq_list.size()) { + req = resubmit_rq_list[i]; } else { - rep_to_submit = &req_wraps[reqs_off++]; + req = &req_wraps[reqs_off++]; } - assert(rep_to_submit->req->len > rep_to_submit->finished_len); - rep_to_submit->iov.iov_base = - rep_to_submit->req->scratch + rep_to_submit->finished_len; - rep_to_submit->iov.iov_len = - rep_to_submit->req->len - rep_to_submit->finished_len; + assert(req->req->len > req->finished_len); + req->iov.iov_base = req->req->scratch + req->finished_len; + req->iov.iov_len = req->req->len - req->finished_len; struct io_uring_sqe* sqe; sqe = io_uring_get_sqe(iu); - io_uring_prep_readv( - sqe, fd_, &rep_to_submit->iov, 1, - rep_to_submit->req->offset + rep_to_submit->finished_len); - io_uring_sqe_set_data(sqe, rep_to_submit); - wrap_cache.emplace(rep_to_submit); + // NULL is unexpected as we do maintain proper ring accounting. + assert(sqe); + io_uring_prep_readv(sqe, fd_, &req->iov, 1, + req->req->offset + req->finished_len); + io_uring_sqe_set_data(sqe, req); + wrap_cache.emplace(req); } - incomplete_rq_list.clear(); + resubmit_rq_list.clear(); + + struct io_uring_cqe* cqe = nullptr; + unsigned head; + ssize_t err = 0; + bool memory_pressure_on_submission = false; + unsigned reqs_pending_submission; + unsigned reqs_submitted = 0; + while ((reqs_pending_submission = io_uring_sq_ready(iu))) { + // MultiRead is synchronous in nature. io_uring_submit_and_wait provides + // batching semantics (submit + best effort wait in one syscall), while + // io_uring_submit enables async producer/consumer semantics (submit + // only, requires separate reaping). We chose batching approach to + // reduce the volume of syscalls and context switches. + ssize_t ret = io_uring_submit_and_wait(iu, reqs_pending_submission); + if (ret < 0) { + if (-EINTR == ret || -EAGAIN == ret) { + // Submission failed due to rare, retryable syscall error. Try again. + continue; + } + if (-ENOMEM == ret) { + fprintf(stderr, + "PosixRandomAccessFile::MultiRead: io_uring_submit_and_wait " + "experienced terse memory condition.\n"); + // Best effort to reclaim resources in terse condition. + memory_pressure_on_submission = true; + } else { + fprintf(stderr, + "PosixRandomAccessFile::MultiRead: " + "io_uring_submit_and_wait returned terminal error: %zd.\n", + ret); + err = ret; + } + break; + } + if (0 == ret) { + // This scenario is unexpected for any modern kernel! + // We deliberately error out to avoid bugs around infinite loops. + fprintf(stderr, + "PosixRandomAccessFile::MultiRead: " + "io_uring_submit_and_wait returned 0 submissions!\n"); + break; + } + reqs_submitted += static_cast(ret); + }; + reqs_pending_submission = io_uring_sq_ready(iu); - ssize_t ret = - io_uring_submit_and_wait(iu, static_cast(this_reqs)); TEST_SYNC_POINT_CALLBACK( - "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1", - &ret); - TEST_SYNC_POINT_CALLBACK( - "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2", - iu); - - if (static_cast(ret) != this_reqs) { - fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs); - // If error happens and we submitted fewer than expected, it is an - // exception case and we don't retry here. We should still consume - // what is is submitted in the ring. - for (ssize_t i = 0; i < ret; i++) { - struct io_uring_cqe* cqe = nullptr; - io_uring_wait_cqe(iu, &cqe); - if (cqe != nullptr) { - io_uring_cqe_seen(iu, cqe); + "PosixRandomAccessFile::MultiRead:io_uring_sq_ready:return1", + &reqs_pending_submission); + + // Error occurred or IO uring stopped submitting outstanding requests. + if (reqs_pending_submission && !memory_pressure_on_submission) { + // IO ring is initialized once in thread-local variable and then reused + // to handle the consecutive MultiRead API calls. Therefore, it's crucial + // to reap all the submitted requests. + // + // NOTE: Loop will run indefinitely until we reap all the completions!!! + size_t nr = 0; + assert(reqs_pending_submission <= wrap_cache.size()); + size_t nr_await_cqe = wrap_cache.size() - reqs_pending_submission; + while (nr < nr_await_cqe) { + // blocking + io_uring_wait_cqes(iu, &cqe, + static_cast(nr_await_cqe - nr), + nullptr, nullptr); + size_t reaped_cqe_count = 0; + io_uring_for_each_cqe(iu, head, cqe) { reaped_cqe_count++; } + if (reaped_cqe_count > 0) { + io_uring_cq_advance(iu, static_cast(reaped_cqe_count)); + nr += reaped_cqe_count; } } - return IOStatus::IOError("io_uring_submit_and_wait() requested " + - std::to_string(this_reqs) + " but returned " + - std::to_string(ret)); - } - for (size_t i = 0; i < this_reqs; i++) { - struct io_uring_cqe* cqe = nullptr; - WrappedReadRequest* req_wrap; - - // We could use the peek variant here, but this seems safer in terms - // of our initial wait not reaping all completions - ret = io_uring_wait_cqe(iu, &cqe); TEST_SYNC_POINT_CALLBACK( - "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret); - if (ret) { - ios = IOStatus::IOError("io_uring_wait_cqe() returns " + - std::to_string(ret)); - - if (cqe != nullptr) { - io_uring_cqe_seen(iu, cqe); - } - continue; + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2", + iu); + + // While all the submitted completions have been reaped successfully, + // IO ring submission queue still contains at least one non-submitted + // request. Destroy io_uring (discards unsubmitted SQEs). + // + // NOTE: This is a rare scenario and should not happen in normal cases. + // Hence, this should NOT materially impact the performance metrics. + io_uring_queue_exit(iu); + delete iu; + thread_local_multi_read_io_urings_->Reset(nullptr); + + if (err < 0) { + return IOStatus::IOError( + "io_uring_submit_and_wait() failed with an error " + + std::to_string(err)); } + return IOStatus::IOError( + "io_uring_submit_and_wait() requested " + + std::to_string(reqs_submitted + reqs_pending_submission) + + " but returned " + std::to_string(reqs_submitted)); + } - req_wrap = static_cast(io_uring_cqe_get_data(cqe)); - // Reset cqe data to catch any stray reuse of it - static_cast(cqe)->user_data = 0xd5d5d5d5d5d5d5d5; - // Check that we got a valid unique cqe data - auto wrap_check = wrap_cache.find(req_wrap); - if (wrap_check == wrap_cache.end()) { - fprintf(stderr, - "PosixRandomAccessFile::MultiRead: " - "Bad cqe data from IO uring - %p\n", - req_wrap); - port::PrintStack(); - ios = IOStatus::IOError("io_uring_cqe_get_data() returned " + - std::to_string((uint64_t)req_wrap)); - continue; - } - wrap_cache.erase(wrap_check); - - FSReadRequest* req = req_wrap->req; - size_t bytes_read = 0; - bool read_again = false; - UpdateResult(cqe, filename_, req->len, req_wrap->iov.iov_len, - false /*async_read*/, use_direct_io(), - GetRequiredBufferAlignment(), req_wrap->finished_len, req, - bytes_read, read_again); - int32_t res = cqe->res; - if (res >= 0) { - if (bytes_read == 0) { + if ((0 == reqs_submitted) && wrap_cache.size() > reqs_pending_submission) { + // If no requests have been submitted and there is at least one request + // pending completion, wait for at least one completion to arrive. + // This is a guardrail to prevent the busy CPU loops. + // + // NOTE: it's not really a tight CPU-burning loop in the traditional sense + // as it's naturally throttled by the io_uring_submit_and_wait() syscall. + io_uring_wait_cqe(iu, &cqe); + } + + unsigned int nr = 0; + io_uring_for_each_cqe(iu, head, cqe) { // non-blocking + if (cqe->user_data) { // non-discarded, valid user data only! + nr++; + WrappedReadRequest* req_wrap = + static_cast(io_uring_cqe_get_data(cqe)); + // Reset cqe data to catch any stray reuse of it + static_cast(cqe)->user_data = 0xd5d5d5d5d5d5d5d5; + // Check that we got a valid unique cqe data + auto wrap_check = wrap_cache.find(req_wrap); + if (wrap_check == wrap_cache.end()) { + fprintf(stderr, + "PosixRandomAccessFile::MultiRead: " + "Bad cqe data from IO uring - %p\n", + req_wrap); + port::PrintStack(); + ios = IOStatus::IOError("io_uring_cqe_get_data() returned " + + std::to_string((uint64_t)req_wrap)); + continue; + } + wrap_cache.erase(wrap_check); + if (cqe->res < 0) { + if (-EINTR == cqe->res || -EAGAIN == cqe->res) { + resubmit_rq_list.push_back(req_wrap); + } else { + ios = IOStatus::IOError("io_uring_for_each_cqe() returns " + + std::to_string(cqe->res)); + } + continue; + } + // cqe->res >= 0 + FSReadRequest* req = req_wrap->req; + size_t bytes_read = 0; + bool read_again = false; + UpdateResult(cqe, filename_, req->len, req_wrap->iov.iov_len, + false /*async_read*/, use_direct_io(), + GetRequiredBufferAlignment(), req_wrap->finished_len, req, + bytes_read, read_again); + + if (0 == bytes_read) { if (read_again) { Slice tmp_slice; req->status = @@ -808,14 +978,15 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs, req->result = Slice(req->scratch, req_wrap->finished_len + tmp_slice.size()); } - // else It means EOF so no need to do anything. + // else it means EOF so no need to do anything. } else if (bytes_read < req_wrap->iov.iov_len) { - incomplete_rq_list.push_back(req_wrap); + resubmit_rq_list.push_back(req_wrap); } } - io_uring_cqe_seen(iu, cqe); } - wrap_cache.clear(); + if (nr > 0) { + io_uring_cq_advance(iu, nr); + } } return ios; #else @@ -912,19 +1083,21 @@ IOStatus PosixRandomAccessFile::ReadAsync( #if defined(ROCKSDB_IOURING_PRESENT) // io_uring_queue_init. struct io_uring* iu = nullptr; - if (thread_local_io_urings_) { - iu = static_cast(thread_local_io_urings_->Get()); + if (thread_local_async_read_io_urings_) { + iu = static_cast( + thread_local_async_read_io_urings_->Get()); if (iu == nullptr) { iu = CreateIOUring(); if (iu != nullptr) { - thread_local_io_urings_->Reset(iu); + thread_local_async_read_io_urings_->Reset(iu); } } } // Init failed, platform doesn't support io_uring. if (iu == nullptr) { - return IOStatus::NotSupported("ReadAsync"); + fprintf(stderr, "failed to init io_uring\n"); + return IOStatus::NotSupported("ReadAsync: failed to init io_uring"); } // Allocate io_handle. @@ -954,11 +1127,35 @@ IOStatus PosixRandomAccessFile::ReadAsync( io_uring_sqe_set_data(sqe, posix_handle); // Step 4: io_uring_submit - ssize_t ret = io_uring_submit(iu); - if (ret < 0) { - fprintf(stderr, "io_uring_submit error: %ld\n", long(ret)); - return IOStatus::IOError("io_uring_submit() requested but returned " + - std::to_string(ret)); + ssize_t ret; + do { + ret = io_uring_submit(iu); + if (ret < 0) { + if (-EINTR == ret || -EAGAIN == ret) { + // Submission failed due to transient error. Try again. + continue; + } + fprintf(stderr, + "PosixRandomAccessFile::ReadAsync: " + "io_uring_submit returned terminal error = %zd\n", + ret); + break; + } + if (0 == ret) { + // Unexpected. Will be reported as error. + break; + } + } while (ret < 1); + if (ret <= 0) { + return IOStatus::IOError( + "PosixRandomAccessFile::ReadAsync: io_uring_submit() returned " + + std::to_string(ret)); + } + if (ret > 1) { + fprintf(stderr, + "PosixRandomAccessFile::ReadAsync: " + "io_uring_submit() returned = %zd\n", + ret); } return IOStatus::OK(); #else @@ -967,7 +1164,8 @@ IOStatus PosixRandomAccessFile::ReadAsync( (void)cb_arg; (void)io_handle; (void)del_fn; - return IOStatus::NotSupported("ReadAsync"); + return IOStatus::NotSupported( + "ReadAsync: ROCKSDB_IOURING_PRESENT is not set"); #endif } @@ -1056,6 +1254,11 @@ IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) { #endif } +IOStatus PosixMmapReadableFile::GetFileSize(uint64_t* result) { + *result = length_; + return IOStatus::OK(); +} + /* * PosixMmapFile * @@ -1138,7 +1341,8 @@ IOStatus PosixMmapFile::Msync() { } PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size, - const EnvOptions& options) + const EnvOptions& options, + uint64_t initial_file_size) : filename_(fname), fd_(fd), page_size_(page_size), @@ -1147,7 +1351,7 @@ PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size, limit_(nullptr), dst_(nullptr), last_sync_(nullptr), - file_offset_(0) { + file_offset_(initial_file_size) { #ifdef ROCKSDB_FALLOCATE_PRESENT allow_fallocate_ = options.allow_fallocate; fallocate_with_keep_size_ = options.fallocate_with_keep_size; @@ -1317,12 +1521,13 @@ IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len, */ PosixWritableFile::PosixWritableFile(const std::string& fname, int fd, size_t logical_block_size, - const EnvOptions& options) + const EnvOptions& options, + uint64_t initial_file_size) : FSWritableFile(options), filename_(fname), use_direct_io_(options.use_direct_writes), fd_(fd), - filesize_(0), + filesize_(initial_file_size), logical_sector_size_(logical_block_size) { #ifdef ROCKSDB_FALLOCATE_PRESENT allow_fallocate_ = options.allow_fallocate; @@ -1386,6 +1591,7 @@ IOStatus PosixWritableFile::Truncate(uint64_t size, const IOOptions& /*opts*/, filename_, errno); } else { filesize_ = size; + lseek(fd_, filesize_, SEEK_SET); } return s; } diff --git a/env/io_posix.h b/env/io_posix.h index 60788df9bf8b..bca0c5836a63 100644 --- a/env/io_posix.h +++ b/env/io_posix.h @@ -11,6 +11,16 @@ #if defined(ROCKSDB_IOURING_PRESENT) #include #include + +// Compatibility defines for io_uring flags that may not be present in older +// kernel headers. These values are fixed and won't change, so it's safe to +// define them even if the running kernel doesn't support them. +#ifndef IORING_SETUP_SINGLE_ISSUER +#define IORING_SETUP_SINGLE_ISSUER (1U << 12) +#endif +#ifndef IORING_SETUP_DEFER_TASKRUN +#define IORING_SETUP_DEFER_TASKRUN (1U << 13) +#endif #endif #include @@ -117,6 +127,7 @@ struct Posix_IOHandle { use_direct_io(_use_direct_io), alignment(_alignment), is_finished(false), + is_being_aborted(false), req_count(0) {} struct iovec iov; @@ -129,6 +140,10 @@ struct Posix_IOHandle { bool use_direct_io; size_t alignment; bool is_finished; + // is_being_aborted is set by AbortIO when a cancel request is submitted. + // Used to distinguish between aborted handles (expect 2 completions) and + // non-aborted handles (expect 1 completion) when processing completions. + bool is_being_aborted; // req_count is used by AbortIO API to keep track of number of requests. uint32_t req_count; }; @@ -187,6 +202,27 @@ inline void UpdateResult(struct io_uring_cqe* cqe, const std::string& file_name, (void)len; #endif } + +// Finalize a completed async read request. +// Processes the CQE result, marks the handle as finished, and invokes the +// callback. This is shared between Poll and AbortIO (for non-aborted handles). +inline void FinalizeAsyncRead(struct io_uring* iu, struct io_uring_cqe* cqe, + Posix_IOHandle* posix_handle) { + FSReadRequest req; + req.scratch = posix_handle->scratch; + req.offset = posix_handle->offset; + req.len = posix_handle->len; + + size_t finished_len = 0; + size_t bytes_read = 0; + bool read_again = false; + UpdateResult(cqe, "", req.len, posix_handle->iov.iov_len, true /*async_read*/, + posix_handle->use_direct_io, posix_handle->alignment, + finished_len, &req, bytes_read, read_again); + posix_handle->is_finished = true; + io_uring_cqe_seen(iu, cqe); + posix_handle->cb(req, posix_handle->cb_arg); +} #endif #ifdef OS_LINUX @@ -299,7 +335,10 @@ inline void DeleteIOUring(void* p) { inline struct io_uring* CreateIOUring() { struct io_uring* new_io_uring = new struct io_uring; - int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, 0); + unsigned int flags = 0; + flags |= IORING_SETUP_SINGLE_ISSUER; + flags |= IORING_SETUP_DEFER_TASKRUN; + int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, flags); if (ret) { delete new_io_uring; new_io_uring = nullptr; @@ -315,7 +354,8 @@ class PosixRandomAccessFile : public FSRandomAccessFile { bool use_direct_io_; size_t logical_sector_size_; #if defined(ROCKSDB_IOURING_PRESENT) - ThreadLocalPtr* thread_local_io_urings_; + ThreadLocalPtr* thread_local_async_read_io_urings_; + ThreadLocalPtr* thread_local_multi_read_io_urings_; #endif public: @@ -323,7 +363,8 @@ class PosixRandomAccessFile : public FSRandomAccessFile { size_t logical_block_size, const EnvOptions& options #if defined(ROCKSDB_IOURING_PRESENT) , - ThreadLocalPtr* thread_local_io_urings + ThreadLocalPtr* thread_local_async_read_io_urings, + ThreadLocalPtr* thread_local_multi_read_io_urings #endif ); virtual ~PosixRandomAccessFile(); @@ -352,6 +393,8 @@ class PosixRandomAccessFile : public FSRandomAccessFile { void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) override; + + virtual IOStatus GetFileSize(uint64_t* result) override; }; class PosixWritableFile : public FSWritableFile { @@ -374,7 +417,8 @@ class PosixWritableFile : public FSWritableFile { public: explicit PosixWritableFile(const std::string& fname, int fd, size_t logical_block_size, - const EnvOptions& options); + const EnvOptions& options, + uint64_t initial_file_size); virtual ~PosixWritableFile(); // Need to implement this so the file is truncated correctly @@ -436,6 +480,7 @@ class PosixMmapReadableFile : public FSRandomAccessFile { char* scratch, IODebugContext* dbg) const override; void Hint(AccessPattern pattern) override; IOStatus InvalidateCache(size_t offset, size_t length) override; + virtual IOStatus GetFileSize(uint64_t* result) override; }; class PosixMmapFile : public FSWritableFile { @@ -469,7 +514,7 @@ class PosixMmapFile : public FSWritableFile { public: PosixMmapFile(const std::string& fname, int fd, size_t page_size, - const EnvOptions& options); + const EnvOptions& options, uint64_t initial_file_size); ~PosixMmapFile(); // Means Close() will properly take care of truncate diff --git a/env/io_posix_test.cc b/env/io_posix_test.cc index 81ce5058708b..6daff356afaf 100644 --- a/env/io_posix_test.cc +++ b/env/io_posix_test.cc @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #include "test_util/testharness.h" +#include "util/random.h" #ifdef ROCKSDB_LIB_IO_POSIX #include "env/io_posix.h" @@ -131,6 +132,48 @@ TEST_F(LogicalBlockSizeCacheTest, Ref) { } #endif +class PosixWritableFileTest : public testing::Test {}; + +TEST_F(PosixWritableFileTest, SeekAfterTruncate) { + std::shared_ptr fs = FileSystem::Default(); + std::string path = + test::PerThreadDBPath("PosixWritableFileTest_SeekAfterTruncate"); + Random rnd(300); + std::unique_ptr wfile; + + ASSERT_OK(fs->NewWritableFile(path, FileOptions(), &wfile, nullptr)); + ASSERT_OK(wfile->Append(rnd.RandomString(16384), IOOptions(), nullptr)); + ASSERT_OK(wfile->Truncate(4096, IOOptions(), nullptr)); + ASSERT_OK(wfile->Append(rnd.RandomString(4096), IOOptions(), nullptr)); + ASSERT_OK(wfile->Close(IOOptions(), nullptr)); + wfile.reset(); + + uint64_t size = 0; + ASSERT_OK(fs->GetFileSize(path, IOOptions(), &size, nullptr)); + ASSERT_EQ(size, 8192); + ASSERT_OK(fs->DeleteFile(path, IOOptions(), nullptr)); +} + +TEST_F(PosixWritableFileTest, SeekAfterExtend) { + std::shared_ptr fs = FileSystem::Default(); + std::string path = + test::PerThreadDBPath("PosixWritableFileTest_SeekAfterTruncate"); + Random rnd(300); + std::unique_ptr wfile; + + ASSERT_OK(fs->NewWritableFile(path, FileOptions(), &wfile, nullptr)); + ASSERT_OK(wfile->Append(rnd.RandomString(4096), IOOptions(), nullptr)); + ASSERT_OK(wfile->Truncate(8192, IOOptions(), nullptr)); + ASSERT_OK(wfile->Append(rnd.RandomString(8192), IOOptions(), nullptr)); + ASSERT_OK(wfile->Close(IOOptions(), nullptr)); + wfile.reset(); + + uint64_t size = 0; + ASSERT_OK(fs->GetFileSize(path, IOOptions(), &size, nullptr)); + ASSERT_EQ(size, 16384); + ASSERT_OK(fs->DeleteFile(path, IOOptions(), nullptr)); +} + } // namespace ROCKSDB_NAMESPACE #endif diff --git a/env/mock_env.cc b/env/mock_env.cc index bf0e76adbbe4..0f9e5ab47f67 100644 --- a/env/mock_env.cc +++ b/env/mock_env.cc @@ -322,6 +322,11 @@ class MockRandomAccessFile : public FSRandomAccessFile { } } + IOStatus GetFileSize(uint64_t* size) override { + *size = file_->Size(); + return IOStatus::OK(); + } + private: MemFile* file_; bool use_direct_io_; diff --git a/examples/Makefile b/examples/Makefile index b056508a6c3f..0970cfd4002d 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -19,16 +19,16 @@ CFLAGS += -Wstrict-prototypes all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example compaction_filter_example options_file_example rocksdb_backup_restore_example simple_example: librocksdb simple_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) column_families_example: librocksdb column_families_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) compaction_filter_example: librocksdb compaction_filter_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) compact_files_example: librocksdb compact_files_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) .c.o: $(CC) $(CFLAGS) -c $< -o $@ -I../include @@ -37,19 +37,19 @@ c_simple_example: librocksdb c_simple_example.o $(CXX) $@.o -o$@ ../librocksdb.a $(PLATFORM_LDFLAGS) $(EXEC_LDFLAGS) optimistic_transaction_example: librocksdb optimistic_transaction_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) transaction_example: librocksdb transaction_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) options_file_example: librocksdb options_file_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) multi_processes_example: librocksdb multi_processes_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) rocksdb_backup_restore_example: librocksdb rocksdb_backup_restore_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) clean: rm -rf ./simple_example ./column_families_example ./compact_files_example ./compaction_filter_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example ./options_file_example ./multi_processes_example ./rocksdb_backup_restore_example diff --git a/examples/column_families_example.cc b/examples/column_families_example.cc index 3828d3fb3f73..f8ce4b8c7013 100644 --- a/examples/column_families_example.cc +++ b/examples/column_families_example.cc @@ -3,6 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). #include +#include #include #include @@ -32,7 +33,7 @@ int main() { // open DB Options options; options.create_if_missing = true; - DB* db; + std::unique_ptr db; Status s = DB::Open(options, kDBPath, &db); assert(s.ok()); @@ -44,7 +45,7 @@ int main() { // close DB s = db->DestroyColumnFamilyHandle(cf); assert(s.ok()); - delete db; + db.reset(); // open DB with two column families std::vector column_families; @@ -82,7 +83,7 @@ int main() { s = db->DestroyColumnFamilyHandle(handle); assert(s.ok()); } - delete db; + db.reset(); return 0; } diff --git a/examples/compact_files_example.cc b/examples/compact_files_example.cc index 52b054002d76..cc9e04e4506b 100644 --- a/examples/compact_files_example.cc +++ b/examples/compact_files_example.cc @@ -6,6 +6,7 @@ // An example code demonstrating how to use CompactFiles, EventListener, // and GetColumnFamilyMetaData APIs to implement custom compaction algorithm. +#include #include #include @@ -151,10 +152,12 @@ int main() { options.IncreaseParallelism(5); options.listeners.emplace_back(new FullCompactor(options)); - DB* db = nullptr; + std::unique_ptr db; ROCKSDB_NAMESPACE::DestroyDB(kDBPath, options); - Status s = DB::Open(options, kDBPath, &db); - assert(s.ok()); + { + Status s = DB::Open(options, kDBPath, &db); + assert(s.ok()); + } assert(db); // if background compaction is not working, write will stall @@ -172,7 +175,7 @@ int main() { } // close the db. - delete db; + db.reset(); return 0; } diff --git a/examples/compaction_filter_example.cc b/examples/compaction_filter_example.cc index 03a1952600d7..9c17a229940b 100644 --- a/examples/compaction_filter_example.cc +++ b/examples/compaction_filter_example.cc @@ -63,7 +63,7 @@ std::string kRemoveDirCommand = "rm -rf "; #endif int main() { - ROCKSDB_NAMESPACE::DB* raw_db; + std::unique_ptr db; ROCKSDB_NAMESPACE::Status status; MyFilter filter; @@ -77,9 +77,8 @@ int main() { options.create_if_missing = true; options.merge_operator.reset(new MyMerge); options.compaction_filter = &filter; - status = ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &raw_db); + status = ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db); assert(status.ok()); - std::unique_ptr db(raw_db); ROCKSDB_NAMESPACE::WriteOptions wopts; db->Merge(wopts, "0", "bad"); // This is filtered out diff --git a/examples/multi_processes_example.cc b/examples/multi_processes_example.cc index b9a6cbe207d1..20a3af3637b4 100644 --- a/examples/multi_processes_example.cc +++ b/examples/multi_processes_example.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -147,7 +148,7 @@ void CreateDB() { assert(false); } options.create_if_missing = true; - DB* db = nullptr; + std::unique_ptr db; s = DB::Open(options, kDBPath, &db); if (!s.ok()) { fprintf(stderr, "[process %ld] Failed to open DB: %s\n", my_pid, @@ -173,7 +174,7 @@ void CreateDB() { delete h; } handles.clear(); - delete db; + db.reset(); } void RunPrimary() { @@ -181,7 +182,7 @@ void RunPrimary() { fprintf(stdout, "[process %ld] Primary instance starts\n", my_pid); CreateDB(); std::srand(time(nullptr)); - DB* db = nullptr; + std::unique_ptr db; Options options; options.create_if_missing = false; std::vector column_families; @@ -227,8 +228,7 @@ void RunPrimary() { delete h; } handles.clear(); - delete db; - db = nullptr; + db.reset(); } } if (nullptr != db) { @@ -236,8 +236,7 @@ void RunPrimary() { delete h; } handles.clear(); - delete db; - db = nullptr; + db.reset(); } fprintf(stdout, "[process %ld] Finished adding keys\n", my_pid); } @@ -262,7 +261,7 @@ void RunSecondary() { exit(0); } } - DB* db = nullptr; + std::unique_ptr db; Options options; options.create_if_missing = false; options.max_open_files = -1; @@ -344,7 +343,7 @@ void RunSecondary() { column_families.push_back(ColumnFamilyDescriptor(cf_name, options)); } std::vector handles; - DB* verification_db = nullptr; + std::unique_ptr verification_db; s = DB::OpenForReadOnly(options, kDBPath, column_families, &handles, &verification_db); assert(s.ok()); @@ -369,8 +368,8 @@ void RunSecondary() { } delete iter; delete iter1; - delete db; - delete verification_db; + db.reset(); + verification_db.reset(); } int main(int argc, char** argv) { diff --git a/examples/options_file_example.cc b/examples/options_file_example.cc index 00632f391ae9..09be3185ca88 100644 --- a/examples/options_file_example.cc +++ b/examples/options_file_example.cc @@ -7,6 +7,7 @@ // rocksdb/utilities/options_util.h to open a rocksdb database without // remembering all the rocksdb options. #include +#include #include #include @@ -74,7 +75,7 @@ int main() { cf_descs[1].options.table_factory.reset(NewBlockBasedTableFactory(bbt_opts)); // destroy and open DB - DB* db; + std::unique_ptr db; Status s = ROCKSDB_NAMESPACE::DestroyDB(kDBPath, Options(db_opt, cf_descs[0].options)); assert(s.ok()); @@ -88,7 +89,7 @@ int main() { // close DB delete cf; - delete db; + db.reset(); // In the following code, we will reopen the rocksdb instance using // the options file stored in the db directory. @@ -128,5 +129,5 @@ int main() { for (auto* handle : handles) { delete handle; } - delete db; + db.reset(); } diff --git a/examples/rocksdb_backup_restore_example.cc b/examples/rocksdb_backup_restore_example.cc index c833ed1c2a8f..e5ad703eed8d 100644 --- a/examples/rocksdb_backup_restore_example.cc +++ b/examples/rocksdb_backup_restore_example.cc @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #include +#include #include #include @@ -29,7 +30,7 @@ std::string kDBPath = "/tmp/rocksdb_example"; #endif int main() { - DB* db; + std::unique_ptr db; Options options; // Optimize RocksDB. This is the easiest way to get RocksDB to perform well options.IncreaseParallelism(); @@ -52,7 +53,7 @@ int main() { &backup_engine); assert(s.ok()); - backup_engine->CreateNewBackup(db); + backup_engine->CreateNewBackup(db.get()); assert(s.ok()); std::vector backup_info; @@ -65,9 +66,7 @@ int main() { db->Put(WriteOptions(), "key2", "value2"); assert(s.ok()); - db->Close(); - delete db; - db = nullptr; + db.reset(); // restore db to backup 1 BackupEngineReadOnly* backup_engine_ro; @@ -93,7 +92,7 @@ int main() { delete backup_engine; delete backup_engine_ro; - delete db; + db.reset(); return 0; } diff --git a/examples/simple_example.cc b/examples/simple_example.cc index 2d49c4d14da2..85a87da77cea 100644 --- a/examples/simple_example.cc +++ b/examples/simple_example.cc @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #include +#include #include #include "rocksdb/db.h" @@ -25,7 +26,7 @@ std::string kDBPath = "/tmp/rocksdb_simple_example"; #endif int main() { - DB* db; + std::unique_ptr db; Options options; // Optimize RocksDB. This is the easiest way to get RocksDB to perform well options.IncreaseParallelism(); @@ -87,7 +88,7 @@ int main() { pinnable_val.Reset(); // The Slice pointed by pinnable_val is not valid after this point - delete db; + db.reset(); return 0; } diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index b06409a5dcbb..79bb63c5b3d9 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -130,6 +130,7 @@ Status DeleteScheduler::AddFileToDeletionQueue(const std::string& file_path, s.ToString().c_str()); if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); ROCKS_LOG_ERROR(info_log_, "Failed to mark %s as trash -- %s", file_path.c_str(), s.ToString().c_str()); s = fs_->DeleteFile(file_path, IOOptions(), nullptr); @@ -151,6 +152,7 @@ Status DeleteScheduler::AddFileToDeletionQueue(const std::string& file_path, if (io_s.ok()) { total_trash_size_.fetch_add(trash_file_size); } + IGNORE_STATUS_IF_ERROR(s); } //**TODO: What should we do if we failed to // get the file size? diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc index 7683db861732..ab78fccf72b4 100644 --- a/file/file_prefetch_buffer.cc +++ b/file/file_prefetch_buffer.cc @@ -126,6 +126,8 @@ Status FilePrefetchBuffer::Read(BufferInfo* buf, const IOOptions& opts, if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) { RecordTick(stats_, PREFETCH_BYTES, read_len); + } else if (usage_ == FilePrefetchBufferUsage::kCompactionPrefetch) { + RecordInHistogram(stats_, COMPACTION_PREFETCH_BYTES, read_len); } if (!use_fs_buffer) { // Update the buffer size. @@ -154,8 +156,22 @@ Status FilePrefetchBuffer::ReadAsync(BufferInfo* buf, const IOOptions& opts, &(buf->del_fn_), /*aligned_buf =*/nullptr); req.status.PermitUncheckedError(); if (s.ok()) { - RecordTick(stats_, PREFETCH_BYTES, read_len); + if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) { + RecordTick(stats_, PREFETCH_BYTES, read_len); + } buf->async_read_in_progress_ = true; + } else if (s.IsNotSupported()) { + // Async IO is not available (e.g., io_uring failed to initialize). + // Fall back to synchronous read so the buffer is populated inline + // and callers proceed transparently. + s = reader->Read(opts, start_offset, read_len, &result, + buf->buffer_.BufferStart(), /*aligned_buf=*/nullptr); + if (s.ok()) { + buf->buffer_.Size(buf->CurrentSize() + result.size()); + if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) { + RecordTick(stats_, PREFETCH_BYTES, read_len); + } + } } return s; } @@ -347,7 +363,7 @@ void FilePrefetchBuffer::ClearOutdatedData(uint64_t offset, size_t length) { assert(IsBufferQueueEmpty() || buf->IsOffsetInBuffer(offset)); } -void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) { +Status FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) { BufferInfo* buf = GetFirstBuffer(); if (buf->async_read_in_progress_ && fs_ != nullptr) { @@ -358,7 +374,16 @@ void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) { std::vector handles; handles.emplace_back(buf->io_handle_); StopWatch sw(clock_, stats_, POLL_WAIT_MICROS); - fs_->Poll(handles, 1).PermitUncheckedError(); + IOStatus io_s = fs_->Poll(handles, 1); + // Allow tests to inject Poll errors + TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::PollIfNeeded:IOStatus", + &io_s); + if (!io_s.ok()) { + // On Poll failure, clean up the handle and abort. + // DestroyAndClearIOHandle also sets async_read_in_progress_ to false. + DestroyAndClearIOHandle(buf); + return io_s; + } } // Reset and Release io_handle after the Poll API as request has been @@ -369,6 +394,7 @@ void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) { // Always call outdated data after Poll as Buffers might be out of sync w.r.t // offset and length. ClearOutdatedData(offset, length); + return Status::OK(); } // ReadAheadSizeTuning API calls readaheadsize_cb_ @@ -507,7 +533,10 @@ Status FilePrefetchBuffer::HandleOverlappingAsyncData( // by Seek, but the next access is at another offset. if (buf->async_read_in_progress_ && buf->IsOffsetInBufferWithAsyncProgress(offset)) { - PollIfNeeded(offset, length); + Status poll_status = PollIfNeeded(offset, length); + if (!poll_status.ok()) { + return poll_status; + } } if (IsBufferQueueEmpty() || NumBuffersAllocated() == 1) { @@ -642,7 +671,10 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts, return s; } } else { - PollIfNeeded(tmp_offset, tmp_length); + Status poll_status = PollIfNeeded(tmp_offset, tmp_length); + if (!poll_status.ok()) { + return poll_status; + } } AllocateBufferIfEmpty(); diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h index b8b6812bc83d..5ebf1f051df9 100644 --- a/file/file_prefetch_buffer.h +++ b/file/file_prefetch_buffer.h @@ -93,8 +93,8 @@ struct BufferInfo { // // For example - if end offset of previous buffer was 100 and because of // readahead_size optimization, end_offset was trimmed to 60. Then for next - // prefetch call, start_offset should be intialized to 100 i.e start_offset = - // buf->initial_end_offset_. + // prefetch call, start_offset should be initialized to 100 i.e start_offset + // = buf->initial_end_offset_. uint64_t initial_end_offset_ = 0; bool IsDataBlockInBuffer(uint64_t offset, size_t length) { @@ -134,6 +134,7 @@ struct BufferInfo { enum class FilePrefetchBufferUsage { kTableOpenPrefetchTail, kUserScanPrefetch, + kCompactionPrefetch, kUnknown, }; @@ -154,7 +155,7 @@ enum class FilePrefetchBufferUsage { // When reusing the file system allocated buffer, overlap_buf_ is used if the // main buffer only contains part of the requested data. It is returned to // the caller after the remaining data is fetched. -// If num_buffers_ > 1, then the data is prefetched asynchronosuly in the +// If num_buffers_ > 1, then the data is prefetched asynchronously in the // buffers whenever the data is consumed from the buffers and that buffer is // freed. // If num_buffers > 1, then requested data can be overlapping between 2 buffers. @@ -430,7 +431,7 @@ class FilePrefetchBuffer { void ClearOutdatedData(uint64_t offset, size_t len); // It calls Poll API to check for any pending asynchronous request. - void PollIfNeeded(uint64_t offset, size_t len); + Status PollIfNeeded(uint64_t offset, size_t len); Status PrefetchInternal(const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, size_t length, size_t readahead_size, @@ -574,6 +575,9 @@ class FilePrefetchBuffer { size_t& read_len, uint64_t& aligned_useful_len); void UpdateStats(bool found_in_buffer, size_t length_found) { + if (usage_ != FilePrefetchBufferUsage::kUserScanPrefetch) { + return; + } if (found_in_buffer) { RecordTick(stats_, PREFETCH_HITS); } diff --git a/file/file_util.cc b/file/file_util.cc index 105e88690226..c44d799b8ce4 100644 --- a/file/file_util.cc +++ b/file/file_util.cc @@ -22,7 +22,10 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source, Temperature src_temp_hint, std::unique_ptr& dest_writer, uint64_t size, bool use_fsync, - const std::shared_ptr& io_tracer) { + const std::shared_ptr& io_tracer, + uint64_t max_read_buffer_size, + const std::optional& readIOOptions, + const std::optional& writeIOOptions) { FileOptions soptions; IOStatus io_s; std::unique_ptr src_reader; @@ -38,7 +41,8 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source, if (size == 0) { // default argument means copy everything - io_s = fs->GetFileSize(source, opts, &size, nullptr); + io_s = + fs->GetFileSize(source, readIOOptions.value_or(opts), &size, nullptr); if (!io_s.ok()) { return io_s; } @@ -47,14 +51,23 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source, new SequentialFileReader(std::move(srcfile), source, io_tracer)); } - char buffer[4096]; + const size_t read_buffer_size = std::max( + static_cast(4096), static_cast(max_read_buffer_size)); + std::unique_ptr buffer; + buffer.reset(new char[read_buffer_size]); + + Env::IOPriority read_rate_limiter_priority = Env::IO_TOTAL; + if (readIOOptions.has_value()) { + read_rate_limiter_priority = readIOOptions.value().rate_limiter_priority; + } Slice slice; while (size > 0) { - size_t bytes_to_read = std::min(sizeof(buffer), static_cast(size)); + size_t bytes_to_read = std::min(static_cast(read_buffer_size), + static_cast(size)); // TODO: rate limit copy file - io_s = status_to_io_status( - src_reader->Read(bytes_to_read, &slice, buffer, - Env::IO_TOTAL /* rate_limiter_priority */)); + io_s = status_to_io_status(src_reader->Read( + bytes_to_read, &slice, buffer.get(), + read_rate_limiter_priority /* rate_limiter_priority */)); if (!io_s.ok()) { return io_s; } @@ -65,19 +78,22 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source, std::to_string(dest_writer->GetFileSize())); } - io_s = dest_writer->Append(opts, slice); + io_s = dest_writer->Append(writeIOOptions.value_or(opts), slice); if (!io_s.ok()) { return io_s; } size -= slice.size(); } - return dest_writer->Sync(opts, use_fsync); + return dest_writer->Sync(writeIOOptions.value_or(opts), use_fsync); } IOStatus CopyFile(FileSystem* fs, const std::string& source, Temperature src_temp_hint, const std::string& destination, Temperature dst_temp, uint64_t size, bool use_fsync, - const std::shared_ptr& io_tracer) { + const std::shared_ptr& io_tracer, + uint64_t max_read_buffer_size, + const std::optional& readIOOptions, + const std::optional& writeIOOptions) { FileOptions options; IOStatus io_s; std::unique_ptr dest_writer; @@ -96,7 +112,8 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source, } return CopyFile(fs, source, src_temp_hint, dest_writer, size, use_fsync, - io_tracer); + io_tracer, max_read_buffer_size, readIOOptions, + writeIOOptions); } // Utility function to create a file with the provided contents @@ -161,7 +178,8 @@ IOStatus GenerateOneFileChecksum( std::string* file_checksum_func_name, size_t verify_checksums_readahead_size, bool /*allow_mmap_reads*/, std::shared_ptr& io_tracer, RateLimiter* rate_limiter, - const ReadOptions& read_options, Statistics* stats, SystemClock* clock) { + const ReadOptions& read_options, Statistics* stats, SystemClock* clock, + const FileOptions& file_options) { if (checksum_factory == nullptr) { return IOStatus::InvalidArgument("Checksum factory is invalid"); } @@ -201,7 +219,12 @@ IOStatus GenerateOneFileChecksum( std::unique_ptr reader; { std::unique_ptr r_file; - io_s = fs->NewRandomAccessFile(file_path, FileOptions(), &r_file, nullptr); + FileOptions fopts = file_options; + if (fopts.file_checksum.empty()) { + // No expected checksum is known — this is a from-scratch computation. + fopts.file_checksum_func_name = kNoFileChecksumFuncName; + } + io_s = fs->NewRandomAccessFile(file_path, fopts, &r_file, nullptr); if (!io_s.ok()) { return io_s; } @@ -230,15 +253,16 @@ IOStatus GenerateOneFileChecksum( Slice slice; uint64_t offset = 0; IOOptions opts; - io_s = reader->PrepareIOOptions(read_options, opts); + IODebugContext dbg; + io_s = reader->PrepareIOOptions(read_options, opts, &dbg); if (!io_s.ok()) { return io_s; } while (size > 0) { size_t bytes_to_read = static_cast(std::min(uint64_t{readahead_size}, size)); - io_s = - reader->Read(opts, offset, bytes_to_read, &slice, buf.get(), nullptr); + io_s = reader->Read(opts, offset, bytes_to_read, &slice, buf.get(), nullptr, + &dbg); if (!io_s.ok()) { return IOStatus::Corruption("file read failed with error: " + io_s.ToString()); diff --git a/file/file_util.h b/file/file_util.h index 8a72fea27ad3..f460a30caa9b 100644 --- a/file/file_util.h +++ b/file/file_util.h @@ -24,18 +24,28 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source, Temperature src_temp_hint, std::unique_ptr& dest_writer, uint64_t size, bool use_fsync, - const std::shared_ptr& io_tracer); + const std::shared_ptr& io_tracer, + uint64_t max_read_buffer_size = 4096, + const std::optional& readIOOptions = {}, + const std::optional& writeIOOptions = {}); IOStatus CopyFile(FileSystem* fs, const std::string& source, Temperature src_temp_hint, const std::string& destination, Temperature dst_temp, uint64_t size, bool use_fsync, - const std::shared_ptr& io_tracer); + const std::shared_ptr& io_tracer, + uint64_t max_read_buffer_size = 4096, + const std::optional& readIOOptions = {}, + const std::optional& writeIOOptions = {}); inline IOStatus CopyFile(const std::shared_ptr& fs, const std::string& source, Temperature src_temp_hint, const std::string& destination, Temperature dst_temp, uint64_t size, bool use_fsync, - const std::shared_ptr& io_tracer) { + const std::shared_ptr& io_tracer, + uint64_t max_read_buffer_size = 4096, + const std::optional& readIOOptions = {}, + const std::optional& writeIOOptions = {}) { return CopyFile(fs.get(), source, src_temp_hint, destination, dst_temp, size, - use_fsync, io_tracer); + use_fsync, io_tracer, max_read_buffer_size, readIOOptions, + writeIOOptions); } IOStatus CreateFile(FileSystem* fs, const std::string& destination, const std::string& contents, bool use_fsync); @@ -73,10 +83,18 @@ IOStatus GenerateOneFileChecksum( std::string* file_checksum_func_name, size_t verify_checksums_readahead_size, bool allow_mmap_reads, std::shared_ptr& io_tracer, RateLimiter* rate_limiter, - const ReadOptions& read_options, Statistics* stats, SystemClock* clock); + const ReadOptions& read_options, Statistics* stats, SystemClock* clock, + const FileOptions& file_options); inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro, - SystemClock* clock, IOOptions& opts) { + SystemClock* clock, IOOptions& opts, + IODebugContext* dbg = nullptr) { + if (ro.request_id != nullptr) { + if (dbg != nullptr && dbg->request_id == nullptr) { + dbg->SetRequestId(ro.request_id); + } + } + if (ro.deadline.count()) { std::chrono::microseconds now = std::chrono::microseconds(clock->NowMicros()); diff --git a/file/filename.cc b/file/filename.cc index 45cbf9d76a98..d1d9c815a440 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -25,6 +25,7 @@ namespace ROCKSDB_NAMESPACE { const std::string kCurrentFileName = "CURRENT"; const std::string kOptionsFileNamePrefix = "OPTIONS-"; +const std::string kCompactionProgressFileNamePrefix = "COMPACTION_PROGRESS-"; const std::string kTempFileNameSuffix = "dbtmp"; static const std::string kRocksDbTFileExt = "sst"; @@ -242,6 +243,25 @@ std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num) { return dbname + "/" + buffer; } +std::string CompactionProgressFileName(const std::string& dbname, + uint64_t timestamp) { + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%s%llu", + kCompactionProgressFileNamePrefix.c_str(), + static_cast(timestamp)); + return dbname + "/" + buffer; +} + +std::string TempCompactionProgressFileName(const std::string& dbname, + uint64_t timestamp) { + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%s%llu.%s", + kCompactionProgressFileNamePrefix.c_str(), + static_cast(timestamp), + kTempFileNameSuffix.c_str()); + return dbname + "/" + buffer; +} + std::string MetaDatabaseName(const std::string& dbname, uint64_t number) { char buf[100]; snprintf(buf, sizeof(buf), "/METADB-%llu", @@ -264,6 +284,8 @@ std::string IdentityFileName(const std::string& dbname) { // dbname/METADB-[0-9]+ // dbname/OPTIONS-[0-9]+ // dbname/OPTIONS-[0-9]+.dbtmp +// dbname/COMPACTION_PROGRESS-[timestamp] +// dbname/COMPACTION_PROGRESS-[timestamp].dbtmp // Disregards / at the beginning bool ParseFileName(const std::string& fname, uint64_t* number, FileType* type, WalFileType* log_type) { @@ -339,6 +361,24 @@ bool ParseFileName(const std::string& fname, uint64_t* number, } *number = ts_suffix; *type = is_temp_file ? kTempFile : kOptionsFile; + } else if (rest.starts_with(kCompactionProgressFileNamePrefix)) { + uint64_t timestamp; + bool is_temp_file = false; + rest.remove_prefix(kCompactionProgressFileNamePrefix.size()); + const std::string kTempFileNameSuffixWithDot = + std::string(".") + kTempFileNameSuffix; + if (rest.ends_with(kTempFileNameSuffixWithDot)) { + rest.remove_suffix(kTempFileNameSuffixWithDot.size()); + is_temp_file = true; + } + if (!ConsumeDecimalNumber(&rest, ×tamp)) { + return false; + } + if (!rest.empty()) { + return false; + } + *number = timestamp; + *type = is_temp_file ? kTempFile : kCompactionProgressFile; } else { // Avoid strtoull() to keep filename format independent of the // current locale diff --git a/file/filename.h b/file/filename.h index 5a52c745ac6d..399a20f23cfa 100644 --- a/file/filename.h +++ b/file/filename.h @@ -124,7 +124,10 @@ std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts, const std::string& log_dir = ""); extern const std::string kOptionsFileNamePrefix; // = "OPTIONS-" -extern const std::string kTempFileNameSuffix; // = "dbtmp" +extern const std::string + kCompactionProgressFileNamePrefix; // = + // "COMPACTION_PROGRESS-" +extern const std::string kTempFileNameSuffix; // = "dbtmp" // Return a options file name given the "dbname" and file number. // Format: OPTIONS-[number].dbtmp @@ -135,6 +138,16 @@ std::string OptionsFileName(uint64_t file_num); // Format: OPTIONS-[number] std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num); +// Return a compaction progress file name given the timestamp. +// Format: COMPACTION_PROGRESS-[timestamp] +std::string CompactionProgressFileName(const std::string& dbname, + uint64_t timestamp); + +// Return a temp compaction progress file name given the timestamp. +// Format: COMPACTION_PROGRESS-[timestamp].dbtmp +std::string TempCompactionProgressFileName(const std::string& dbname, + uint64_t timestamp); + // Return the name to use for a metadatabase. The result will be prefixed with // "dbname". std::string MetaDatabaseName(const std::string& dbname, uint64_t number); diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc index 2c0919ed9522..57559b5e8466 100644 --- a/file/prefetch_test.cc +++ b/file/prefetch_test.cc @@ -299,9 +299,18 @@ TEST_P(PrefetchTest, Basic) { const uint64_t prev_table_open_prefetch_tail_hit = options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT); + HistogramData pre_compaction_prefetch_bytes; + options.statistics->histogramData(COMPACTION_PREFETCH_BYTES, + &pre_compaction_prefetch_bytes); + ASSERT_EQ(pre_compaction_prefetch_bytes.count, 0); + // commenting out the line below causes the example to work correctly ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + HistogramData post_compaction_prefetch_bytes; + options.statistics->histogramData(COMPACTION_PREFETCH_BYTES, + &post_compaction_prefetch_bytes); + HistogramData cur_table_open_prefetch_tail_read; options.statistics->histogramData(TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, &cur_table_open_prefetch_tail_read); @@ -318,6 +327,7 @@ TEST_P(PrefetchTest, Basic) { ASSERT_GT(fs->GetPrefetchCount(), 1); ASSERT_EQ(0, buff_prefetch_count); fs->ClearPrefetchCount(); + ASSERT_EQ(post_compaction_prefetch_bytes.count, 0); } else { ASSERT_FALSE(fs->IsPrefetchCalled()); // To rule out false positive by the SST file tail prefetch during @@ -331,6 +341,20 @@ TEST_P(PrefetchTest, Basic) { prev_table_open_prefetch_tail_hit); ASSERT_GE(cur_table_open_prefetch_tail_miss, prev_table_open_prefetch_tail_miss); + + ASSERT_GT(post_compaction_prefetch_bytes.count, 0); + + // Not an exact match due to potential roundup/down for alignment + auto expected_compaction_readahead_size = + Options().compaction_readahead_size; + ASSERT_LE(post_compaction_prefetch_bytes.max, + expected_compaction_readahead_size * 1.1); + ASSERT_GE(post_compaction_prefetch_bytes.max, + expected_compaction_readahead_size * 0.9); + ASSERT_LE(post_compaction_prefetch_bytes.average, + expected_compaction_readahead_size * 1.1); + ASSERT_GE(post_compaction_prefetch_bytes.average, + expected_compaction_readahead_size * 0.9); } for (bool disable_io : {false, true}) { @@ -645,7 +669,7 @@ TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { MoveFilesToLevel(level); } Close(); - std::vector buff_prefectch_level_count = {0, 0, 0}; + std::vector buff_prefetch_level_count = {0, 0, 0}; ASSERT_OK(TryReopen(options)); { auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); @@ -683,7 +707,7 @@ TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { iter->Next(); } - buff_prefectch_level_count[level] = buff_prefetch_count; + buff_prefetch_level_count[level] = buff_prefetch_count; if (support_prefetch && !use_direct_io) { if (level == 0) { ASSERT_FALSE(fs->IsPrefetchCalled()); @@ -704,7 +728,7 @@ TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { } if (!support_prefetch) { - ASSERT_GT(buff_prefectch_level_count[1], buff_prefectch_level_count[2]); + ASSERT_GT(buff_prefetch_level_count[1], buff_prefetch_level_count[2]); } SyncPoint::GetInstance()->DisableProcessing(); @@ -790,7 +814,7 @@ TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) { "{initial_auto_readahead_size=0;}"}})); break; case 1: - // intial_auto_readahead_size and max_auto_readahead_size are set + // initial_auto_readahead_size and max_auto_readahead_size are set // same so readahead_size remains same. ASSERT_OK(db_->SetOptions({{"block_based_table_factory", "{initial_auto_readahead_size=4096;max_" @@ -1057,7 +1081,7 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) { } { /* - * Reesek keys from Single Data Block. + * Reseek keys from Single Data Block. */ auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); iter->Seek(BuildKey(0)); @@ -1092,9 +1116,8 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) { ASSERT_TRUE(iter->Valid()); iter->Seek(BuildKey(1008)); ASSERT_TRUE(iter->Valid()); - iter->Seek( - BuildKey(996)); // Reseek won't prefetch any data and - // readahead_size will be initiallized to 8*1024. + iter->Seek(BuildKey(996)); // Reseek won't prefetch any data and + // readahead_size will be initialized to 8*1024. ASSERT_TRUE(iter->Valid()); iter->Seek(BuildKey(992)); ASSERT_TRUE(iter->Valid()); @@ -1566,7 +1589,7 @@ INSTANTIATE_TEST_CASE_P( ::testing::Combine( // Params are as follows - // Param 0 - TableOptions::index_shortening - // Param 2 - ReadOptinos::auto_readahead_size + // Param 2 - ReadOptions::auto_readahead_size ::testing::Values( BlockBasedTableOptions::IndexShorteningMode::kNoShortening, BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators, @@ -2494,6 +2517,187 @@ TEST_P(PrefetchTest1, SeekParallelizationTest) { Close(); } +TEST_P(PrefetchTest1, PollErrorRecoveryDuringIteration) { + // This end-to-end test verifies that Poll() errors during async prefetching + // are properly propagated to the iterator. When Poll() fails, the iterator + // should stop and return an IOError status. + // + // With error injection on the 3rd Poll call, the iterator reads ~231 keys + // (out of 500) before encountering the error. + + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + + const int kNumKeys = 500; + std::shared_ptr fs = std::make_shared( + FileSystem::Default(), /*support_prefetch=*/false); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + bool use_direct_io = GetParam(); + Options options; + SetGenericOptions(env.get(), use_direct_io, options); + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions table_options; + SetBlockBasedTableOptions(table_options); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + ROCKSDB_GTEST_SKIP("Direct IO not supported"); + return; + } + ASSERT_OK(s); + + // Write keys with known values so we can verify correctness + std::map expected_data; + { + WriteBatch batch; + for (int i = 0; i < kNumKeys; i++) { + std::string key = BuildKey(i); + std::string value = "value_" + std::to_string(i) + "_" + + std::string(100, 'x'); // Make values ~110 bytes + ASSERT_OK(batch.Put(key, value)); + expected_data[key] = value; + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush()); + } + + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + // Set up callbacks to track async IO and inject Poll errors + std::atomic poll_call_count{0}; + std::atomic poll_error_injected_count{0}; + bool read_async_called = false; + + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::PollIfNeeded:IOStatus", [&](void* arg) { + poll_call_count++; + int current_count = poll_call_count.load(); + + // Inject error on the third Poll call to allow some keys to be read + // first + if (current_count == 3) { + IOStatus* io_s = static_cast(arg); + *io_s = IOStatus::IOError("Injected Poll error for e2e testing"); + poll_error_injected_count++; + std::cout << "PollErrorRecoveryDuringIteration: Injected error on " + "Poll call #" + << current_count << std::endl; + } + }); + + SyncPoint::GetInstance()->SetCallBack( + "UpdateResults::io_uring_result", + [&](void* /*arg*/) { read_async_called = true; }); + + SyncPoint::GetInstance()->EnableProcessing(); + + // Iterate through all keys with async IO enabled + ReadOptions ro; + ro.async_io = true; + ro.adaptive_readahead = true; + + int keys_read = 0; + int data_mismatches = 0; + Status iter_status; + { + auto iter = std::unique_ptr(db_->NewIterator(ro)); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + std::string key = iter->key().ToString(); + std::string value = iter->value().ToString(); + + auto it = expected_data.find(key); + if (it == expected_data.end()) { + std::cout << "PollErrorRecoveryDuringIteration: Unexpected key: " << key + << std::endl; + data_mismatches++; + } else if (it->second != value) { + std::cout << "PollErrorRecoveryDuringIteration: Value mismatch for key " + << key << std::endl; + data_mismatches++; + } + keys_read++; + } + iter_status = iter->status(); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Log results + std::cout << "PollErrorRecoveryDuringIteration: " << "read_async_called=" + << read_async_called << ", poll_calls=" << poll_call_count.load() + << ", poll_errors_injected=" << poll_error_injected_count.load() + << ", keys_read=" << keys_read << ", expected_keys=" << kNumKeys + << ", data_mismatches=" << data_mismatches + << ", iter_status=" << iter_status.ToString() << std::endl; + + // Verify no data mismatches occurred for keys that were read + ASSERT_EQ(data_mismatches, 0) + << "Found " << data_mismatches << " data mismatches"; + + if (read_async_called) { + // Async IO was used - verify Poll error was injected and propagated + ASSERT_EQ(poll_call_count.load(), 3) + << "Expected exactly 3 Poll calls when error injected on 3rd call"; + ASSERT_EQ(poll_error_injected_count.load(), 1) + << "Expected exactly 1 Poll error to be injected"; + + // The iterator should have stopped with an error status + ASSERT_TRUE(iter_status.IsIOError()) + << "Expected iterator to report IOError after Poll failure, got: " + << iter_status.ToString(); + + std::cout << "PollErrorRecoveryDuringIteration: Successfully verified " + "Poll error was injected and propagated to iterator" + << std::endl; + } else { + // Async IO not supported - iterator should complete successfully + ASSERT_OK(iter_status); + ASSERT_EQ(keys_read, kNumKeys); + std::cout << "PollErrorRecoveryDuringIteration: Async IO (io_uring) not " + "supported on this platform, verified data correctness" + << std::endl; + } + + // Retry iteration without error injection - verify all data is still readable + // This confirms the Poll error didn't corrupt state + { + int retry_keys_read = 0; + int retry_data_mismatches = 0; + auto iter = std::unique_ptr(db_->NewIterator(ro)); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + std::string key = iter->key().ToString(); + std::string value = iter->value().ToString(); + + auto it = expected_data.find(key); + if (it == expected_data.end()) { + retry_data_mismatches++; + } else if (it->second != value) { + retry_data_mismatches++; + } + retry_keys_read++; + } + ASSERT_OK(iter->status()) + << "Retry iteration failed: " << iter->status().ToString(); + ASSERT_EQ(retry_keys_read, kNumKeys) + << "Retry should read all " << kNumKeys << " keys"; + ASSERT_EQ(retry_data_mismatches, 0) + << "Retry found " << retry_data_mismatches << " data mismatches"; + std::cout << "PollErrorRecoveryDuringIteration: Retry succeeded, read all " + << retry_keys_read << " keys correctly" << std::endl; + } + + Close(); +} + namespace { #ifdef GFLAGS const int kMaxArgCount = 100; @@ -3251,8 +3455,9 @@ TEST_F(FilePrefetchBufferTest, SyncReadaheadStats) { ReadaheadParams readahead_params; readahead_params.initial_readahead_size = 8192; readahead_params.max_readahead_size = 8192; - FilePrefetchBuffer fpb(readahead_params, true, false, fs(), nullptr, - stats.get()); + FilePrefetchBuffer fpb( + readahead_params, true, false, fs(), nullptr, stats.get(), + nullptr /* cb */, FilePrefetchBufferUsage::kUserScanPrefetch /* usage */); Slice result; // Simulate a seek of 4096 bytes at offset 0. Due to the readahead settings, // it will do a read of offset 0 and length - (4096 + 8192) 12288. @@ -3278,7 +3483,7 @@ TEST_F(FilePrefetchBufferTest, SyncReadaheadStats) { ASSERT_EQ(stats->getAndResetTickerCount(PREFETCH_HITS), 1); ASSERT_EQ(stats->getAndResetTickerCount(PREFETCH_BYTES_USEFUL), 8192); - // Now read some data with length doesn't align with aligment and it needs + // Now read some data with length doesn't align with alignment and it needs // prefetching. Read from 16000 with length 10000 (i.e. requested end offset - // 26000). ASSERT_TRUE( @@ -3352,6 +3557,118 @@ TEST_F(FilePrefetchBufferTest, ForCompaction) { 0); } +TEST_F(FilePrefetchBufferTest, PollErrorPropagation) { + // This test verifies that Poll() errors in PollIfNeeded are properly + // propagated rather than being silently ignored. + + std::string fname = "poll-error-test"; + Random rand(0); + std::string content = rand.RandomString(32768); + Write(fname, content); + + FileOptions opts; + std::unique_ptr r; + Read(fname, opts, &r); + + // Set up readahead params for async prefetching + ReadaheadParams readahead_params; + readahead_params.initial_readahead_size = 16384; + readahead_params.max_readahead_size = 16384; + + FilePrefetchBuffer fpb(readahead_params, /*enable=*/true, + /*track_min_offset=*/false, fs()); + + Slice result; + // Start an async prefetch to set up async_read_in_progress_ state + Status s = fpb.PrefetchAsync(IOOptions(), r.get(), 0, 4096, &result); + + // Skip test on platforms that don't support async IO. + if (s.IsNotSupported()) { + ROCKSDB_GTEST_SKIP("Async IO not supported on this platform"); + return; + } + ASSERT_TRUE(s.IsTryAgain()); + + // With the ReadAsync sync fallback, PrefetchAsync returns TryAgain even when + // async IO is unavailable (data is read synchronously, but data_found was + // false at entry). Detect by checking async_read_in_progress_ on the buffer. + { + std::vector> buf_info(1); + fpb.TEST_GetBufferOffsetandSize(buf_info); + bool async_read_in_progress = std::get<2>(buf_info[0]); + if (!async_read_in_progress) { + ROCKSDB_GTEST_SKIP("Async IO not available (sync fallback used)"); + return; + } + } + + // Set up SyncPoint to inject Poll error + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::PollIfNeeded:IOStatus", [&](void* arg) { + IOStatus* io_s = static_cast(arg); + *io_s = IOStatus::IOError("Injected Poll error for testing"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // TryReadFromCache will call PollIfNeeded to complete the async read + IOOptions io_opts; + io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW; + Status read_status; + bool found = + fpb.TryReadFromCache(io_opts, r.get(), 0, 4096, &result, &read_status); + + // When PollIfNeeded fails: + // 1. PrefetchInternal returns the error status + // 2. TryReadFromCacheUntracked sets *status to the error and returns false + // Therefore: found should be false, and read_status should contain the error + ASSERT_FALSE(found) << "Expected TryReadFromCache to return false on Poll " + "error, but it returned true"; + ASSERT_TRUE(read_status.IsIOError()) + << "Expected IOError status, got: " << read_status.ToString(); + ASSERT_TRUE(read_status.ToString().find("Injected Poll error") != + std::string::npos) + << "Expected error message to contain 'Injected Poll error', got: " + << read_status.ToString(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(FilePrefetchBufferTest, ReadAsyncSyncFallbackOnNotSupported) { + std::string fname = "read-async-sync-fallback"; + Random rand(0); + std::string content = rand.RandomString(32768); + Write(fname, content); + + FileOptions opts; + std::unique_ptr r; + Read(fname, opts, &r); + + SyncPoint::GetInstance()->SetCallBack( + "RandomAccessFileReader::ReadAsync:InjectStatus", [](void* arg) { + *static_cast(arg) = IOStatus::NotSupported(); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ReadaheadParams readahead_params; + readahead_params.initial_readahead_size = 16384; + readahead_params.max_readahead_size = 16384; + readahead_params.num_buffers = 2; + + FilePrefetchBuffer fpb(readahead_params, /*enable=*/true, + /*track_min_offset=*/false, fs()); + + Slice result; + Status s; + ASSERT_TRUE(fpb.TryReadFromCache(IOOptions(), r.get(), 0, 4096, &result, &s)); + ASSERT_OK(s); + ASSERT_EQ(result.size(), 4096); + ASSERT_EQ(memcmp(result.data(), content.data(), 4096), 0); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + class FSBufferPrefetchTest : public testing::Test, public ::testing::WithParamInterface> { @@ -3497,9 +3814,10 @@ TEST_P(FSBufferPrefetchTest, FSBufferPrefetchStatsInternals) { size_t num_buffers = use_async_prefetch ? 2 : 1; readahead_params.num_buffers = num_buffers; - FilePrefetchBuffer fpb(readahead_params, true /* enable */, - false /* track_min_offset */, fs(), clock(), - stats.get()); + FilePrefetchBuffer fpb( + readahead_params, true /* enable */, false /* track_min_offset */, fs(), + clock(), stats.get(), nullptr /* cb */, + FilePrefetchBufferUsage::kUserScanPrefetch /* usage */); int overlap_buffer_write_ct = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -3516,6 +3834,9 @@ TEST_P(FSBufferPrefetchTest, FSBufferPrefetchStatsInternals) { fpb.TryReadFromCache(IOOptions(), r.get(), 0 /* offset */, 4096 /* n */, &result, &s, for_compaction); // Platforms that don't have IO uring may not support async IO. + // With the ReadAsync sync fallback, s will be OK even when async IO is + // unavailable — detect by checking if the second buffer has an async read + // in progress. if (use_async_prefetch && s.IsNotSupported()) { return; } @@ -3529,6 +3850,14 @@ TEST_P(FSBufferPrefetchTest, FSBufferPrefetchStatsInternals) { fpb.TEST_GetOverlapBufferOffsetandSize(overlap_buffer_info); fpb.TEST_GetBufferOffsetandSize(buffer_info); if (use_async_prefetch) { + bool async_read_in_progress = std::get<2>(buffer_info[1]); + if (!async_read_in_progress) { + // Async IO was requested but not available (e.g., no io_uring). + // ReadAsync fell back to sync read. Skip async-specific assertions. + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + return; + } // Cut the readahead of 8192 in half. // Overlap buffer is not used ASSERT_EQ(overlap_buffer_info.first, 0); @@ -3721,6 +4050,14 @@ TEST_P(FSBufferPrefetchTest, FSBufferPrefetchUnalignedReads) { fpb.TEST_GetOverlapBufferOffsetandSize(overlap_buffer_info); fpb.TEST_GetBufferOffsetandSize(buffer_info); if (use_async_prefetch) { + bool async_read_in_progress = std::get<2>(buffer_info[1]); + if (!async_read_in_progress) { + // Async IO was requested but not available (e.g., no io_uring). + // ReadAsync fell back to sync read. Skip async-specific assertions. + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + return; + } // Overlap buffer is not used ASSERT_EQ(overlap_buffer_info.first, 0); ASSERT_EQ(overlap_buffer_info.second, 0); diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc index 46f5d1c26262..ae070ef34626 100644 --- a/file/random_access_file_reader.cc +++ b/file/random_access_file_reader.cc @@ -66,29 +66,53 @@ inline void RecordIOStats(Statistics* stats, Temperature file_temperature, } // record for temperature file - if (file_temperature != Temperature::kUnknown) { - switch (file_temperature) { - case Temperature::kHot: - IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, size); - IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, 1); - RecordTick(stats, HOT_FILE_READ_BYTES, size); - RecordTick(stats, HOT_FILE_READ_COUNT, 1); - break; - case Temperature::kWarm: - IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, size); - IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, 1); - RecordTick(stats, WARM_FILE_READ_BYTES, size); - RecordTick(stats, WARM_FILE_READ_COUNT, 1); - break; - case Temperature::kCold: - IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, size); - IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, 1); - RecordTick(stats, COLD_FILE_READ_BYTES, size); - RecordTick(stats, COLD_FILE_READ_COUNT, 1); - break; - default: - break; - } + switch (file_temperature) { + case Temperature::kHot: + IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, size); + IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, 1); + RecordTick(stats, HOT_FILE_READ_BYTES, size); + RecordTick(stats, HOT_FILE_READ_COUNT, 1); + break; + case Temperature::kWarm: + IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, size); + IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, 1); + RecordTick(stats, WARM_FILE_READ_BYTES, size); + RecordTick(stats, WARM_FILE_READ_COUNT, 1); + break; + case Temperature::kCool: + IOSTATS_ADD(file_io_stats_by_temperature.cool_file_bytes_read, size); + IOSTATS_ADD(file_io_stats_by_temperature.cool_file_read_count, 1); + RecordTick(stats, COOL_FILE_READ_BYTES, size); + RecordTick(stats, COOL_FILE_READ_COUNT, 1); + break; + case Temperature::kCold: + IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, size); + IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, 1); + RecordTick(stats, COLD_FILE_READ_BYTES, size); + RecordTick(stats, COLD_FILE_READ_COUNT, 1); + break; + case Temperature::kIce: + IOSTATS_ADD(file_io_stats_by_temperature.ice_file_bytes_read, size); + IOSTATS_ADD(file_io_stats_by_temperature.ice_file_read_count, 1); + RecordTick(stats, ICE_FILE_READ_BYTES, size); + RecordTick(stats, ICE_FILE_READ_COUNT, 1); + break; + case Temperature::kUnknown: + if (is_last_level) { + IOSTATS_ADD(file_io_stats_by_temperature.unknown_last_level_bytes_read, + size); + IOSTATS_ADD(file_io_stats_by_temperature.unknown_last_level_read_count, + 1); + } else { + IOSTATS_ADD( + file_io_stats_by_temperature.unknown_non_last_level_bytes_read, + size); + IOSTATS_ADD( + file_io_stats_by_temperature.unknown_non_last_level_read_count, 1); + } + break; + default: + break; } } @@ -106,11 +130,14 @@ IOStatus RandomAccessFileReader::Create( IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, size_t n, Slice* result, char* scratch, - AlignedBuf* aligned_buf) const { + AlignedBuf* aligned_buf, + IODebugContext* dbg) const { (void)aligned_buf; const Env::IOPriority rate_limiter_priority = opts.rate_limiter_priority; TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read", nullptr); + TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read:IODebugContext", + const_cast(static_cast(dbg))); // To be paranoid: modify scratch a little bit, so in case underlying // FileSystem doesn't fill the buffer but return success and `scratch` returns @@ -175,7 +202,7 @@ IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, // the opts.timeout before calling file_->Read assert(!opts.timeout.count() || allowed == read_size); io_s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, opts, - &tmp, buf.Destination(), nullptr); + &tmp, buf.Destination(), dbg); } if (ShouldNotifyListeners()) { auto finish_ts = FileOperationInfo::FinishNow(); @@ -237,7 +264,7 @@ IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, // the opts.timeout before calling file_->Read assert(!opts.timeout.count() || allowed == n); io_s = file_->Read(offset + pos, allowed, opts, &tmp_result, - scratch + pos, nullptr); + scratch + pos, dbg); } if (ShouldNotifyListeners()) { auto finish_ts = FileOperationInfo::FinishNow(); @@ -311,7 +338,8 @@ bool TryMerge(FSReadRequest* dest, const FSReadRequest& src) { IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts, FSReadRequest* read_reqs, size_t num_reqs, - AlignedBuf* aligned_buf) const { + AlignedBuf* aligned_buf, + IODebugContext* dbg) const { (void)aligned_buf; // suppress warning of unused variable in LITE mode assert(num_reqs > 0); @@ -420,8 +448,10 @@ IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts, remaining_bytes -= request_bytes; } } - io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, - /*IODebugContext*=*/nullptr); + TEST_SYNC_POINT_CALLBACK( + "RandomAccessFileReader::MultiRead:IODebugContext", + const_cast(static_cast(dbg))); + io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, dbg); RecordInHistogram(stats_, MULTIGET_IO_BATCH_SIZE, num_fs_reqs); } @@ -475,19 +505,34 @@ IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts, } IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro, - IOOptions& opts) const { + IOOptions& opts, + IODebugContext* dbg) const { if (clock_ != nullptr) { - return PrepareIOFromReadOptions(ro, clock_, opts); + return PrepareIOFromReadOptions(ro, clock_, opts, dbg); } else { - return PrepareIOFromReadOptions(ro, SystemClock::Default().get(), opts); + return PrepareIOFromReadOptions(ro, SystemClock::Default().get(), opts, + dbg); } } +// Notes for when direct_io is enabled: +// Unless req.offset, req.len, req.scratch are all already aligned, +// RandomAccessFileReader will creats aligned requests and aligned buffer for +// the request. User should only provide either req.scratch or aligned_buf. If +// only req.scratch is provided, result will be copied from allocated aligned +// buffer to req.scratch. If only alignd_buf is provided, it will be set to +// the ailgned buf allocated by RandomAccessFileReader and saves a copy. IOStatus RandomAccessFileReader::ReadAsync( FSReadRequest& req, const IOOptions& opts, std::function cb, void* cb_arg, - void** io_handle, IOHandleDeleter* del_fn, AlignedBuf* aligned_buf) { + void** io_handle, IOHandleDeleter* del_fn, AlignedBuf* aligned_buf, + IODebugContext* dbg) { IOStatus s; + TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::ReadAsync:InjectStatus", + &s); + if (!s.ok()) { + return s; + } // Create a callback and populate info. auto read_async_callback = std::bind(&RandomAccessFileReader::ReadAsyncCallback, this, @@ -532,14 +577,14 @@ IOStatus RandomAccessFileReader::ReadAsync( (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); s = file_->ReadAsync(aligned_req, opts, read_async_callback, - read_async_info, io_handle, del_fn, nullptr /*dbg*/); + read_async_info, io_handle, del_fn, dbg); } else { StopWatch sw(clock_, stats_, hist_type_, GetFileReadHistograms(stats_, opts.io_activity), (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); s = file_->ReadAsync(req, opts, read_async_callback, read_async_info, - io_handle, del_fn, nullptr /*dbg*/); + io_handle, del_fn, dbg); } RecordTick(stats_, READ_ASYNC_MICROS, elapsed); diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h index 945e685e3d00..c1de6b973f44 100644 --- a/file/random_access_file_reader.h +++ b/file/random_access_file_reader.h @@ -164,7 +164,8 @@ class RandomAccessFileReader { // the internally allocated buffer on return, and the result refers to a // region in aligned_buf. IOStatus Read(const IOOptions& opts, uint64_t offset, size_t n, Slice* result, - char* scratch, AlignedBuf* aligned_buf) const; + char* scratch, AlignedBuf* aligned_buf, + IODebugContext* dbg = nullptr) const; // REQUIRES: // num_reqs > 0, reqs do not overlap, and offsets in reqs are increasing. @@ -172,10 +173,12 @@ class RandomAccessFileReader { // In direct IO mode, aligned_buf stores the aligned buffer allocated inside // MultiRead, the result Slices in reqs refer to aligned_buf. IOStatus MultiRead(const IOOptions& opts, FSReadRequest* reqs, - size_t num_reqs, AlignedBuf* aligned_buf) const; + size_t num_reqs, AlignedBuf* aligned_buf, + IODebugContext* dbg = nullptr) const; - IOStatus Prefetch(const IOOptions& opts, uint64_t offset, size_t n) const { - return file_->Prefetch(offset, n, opts, nullptr); + IOStatus Prefetch(const IOOptions& opts, uint64_t offset, size_t n, + IODebugContext* dbg = nullptr) const { + return file_->Prefetch(offset, n, opts, dbg); } FSRandomAccessFile* file() { return file_.get(); } @@ -184,12 +187,13 @@ class RandomAccessFileReader { bool use_direct_io() const { return file_->use_direct_io(); } - IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts) const; + IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts, + IODebugContext* dbg = nullptr) const; IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, - AlignedBuf* aligned_buf); + AlignedBuf* aligned_buf, IODebugContext* dbg = nullptr); void ReadAsyncCallback(FSReadRequest& req, void* cb_arg); }; diff --git a/file/random_access_file_reader_test.cc b/file/random_access_file_reader_test.cc index f081795b9d1f..717e985f1adb 100644 --- a/file/random_access_file_reader_test.cc +++ b/file/random_access_file_reader_test.cc @@ -147,8 +147,9 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { reqs.push_back(std::move(r0)); reqs.push_back(std::move(r1)); AlignedBuf aligned_buf; - ASSERT_OK( - r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); + IODebugContext dbg; + ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf, + &dbg)); AssertResult(content, reqs); @@ -192,8 +193,9 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { reqs.push_back(std::move(r1)); reqs.push_back(std::move(r2)); AlignedBuf aligned_buf; - ASSERT_OK( - r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); + IODebugContext dbg; + ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf, + &dbg)); AssertResult(content, reqs); @@ -237,8 +239,9 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { reqs.push_back(std::move(r1)); reqs.push_back(std::move(r2)); AlignedBuf aligned_buf; - ASSERT_OK( - r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); + IODebugContext dbg; + ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf, + &dbg)); AssertResult(content, reqs); @@ -274,8 +277,9 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { reqs.push_back(std::move(r0)); reqs.push_back(std::move(r1)); AlignedBuf aligned_buf; - ASSERT_OK( - r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); + IODebugContext dbg; + ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf, + &dbg)); AssertResult(content, reqs); diff --git a/file/readahead_raf.cc b/file/readahead_raf.cc index dd09822e3e23..004f2ab746ba 100644 --- a/file/readahead_raf.cc +++ b/file/readahead_raf.cc @@ -108,6 +108,10 @@ class ReadaheadRandomAccessFile : public FSRandomAccessFile { bool use_direct_io() const override { return file_->use_direct_io(); } + IOStatus GetFileSize(uint64_t* result) override { + return file_->GetFileSize(result); + } + private: // Tries to read from buffer_ n bytes starting at offset. If anything was read // from the cache, it sets cached_len to the number of bytes actually read, diff --git a/file/sst_file_manager_impl.h b/file/sst_file_manager_impl.h index 96ec271eee37..b98d8594e851 100644 --- a/file/sst_file_manager_impl.h +++ b/file/sst_file_manager_impl.h @@ -162,7 +162,6 @@ class SstFileManagerImpl : public SstFileManager { void Close(); void SetStatisticsPtr(const std::shared_ptr& stats) override { - stats_ = stats; delete_scheduler_.SetStatisticsPtr(stats); } @@ -216,7 +215,6 @@ class SstFileManagerImpl : public SstFileManager { std::list error_handler_list_; // Pointer to ErrorHandler instance that is currently processing recovery ErrorHandler* cur_instance_; - std::shared_ptr stats_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/file/writable_file_writer.cc b/file/writable_file_writer.cc index 41e3b582afa4..2a92c0754dcd 100644 --- a/file/writable_file_writer.cc +++ b/file/writable_file_writer.cc @@ -204,13 +204,14 @@ IOStatus WritableFileWriter::Append(const IOOptions& opts, const Slice& data, return s; } -IOStatus WritableFileWriter::Pad(const IOOptions& opts, - const size_t pad_bytes) { +IOStatus WritableFileWriter::Pad(const IOOptions& opts, const size_t pad_bytes, + const size_t max_pad_size) { + (void)max_pad_size; if (seen_error()) { return GetWriterHasPreviousErrorStatus(); } const IOOptions io_options = FinalizeIOOptions(opts); - assert(pad_bytes < kDefaultPageSize); + assert(pad_bytes < max_pad_size); size_t left = pad_bytes; size_t cap = buf_.Capacity() - buf_.CurrentSize(); @@ -687,9 +688,9 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum(const IOOptions& opts, if (rate_limiter_ != nullptr && rate_limiter_priority_used != Env::IO_TOTAL) { while (data_size > 0) { size_t tmp_size; - tmp_size = rate_limiter_->RequestToken(data_size, buf_.Alignment(), - rate_limiter_priority_used, stats_, - RateLimiter::OpType::kWrite); + tmp_size = + rate_limiter_->RequestToken(data_size, 0, rate_limiter_priority_used, + stats_, RateLimiter::OpType::kWrite); data_size -= tmp_size; } } diff --git a/file/writable_file_writer.h b/file/writable_file_writer.h index b880e1f216b2..619821204b3e 100644 --- a/file/writable_file_writer.h +++ b/file/writable_file_writer.h @@ -256,7 +256,8 @@ class WritableFileWriter { IOStatus Append(const IOOptions& opts, const Slice& data, uint32_t crc32c_checksum = 0); - IOStatus Pad(const IOOptions& opts, const size_t pad_bytes); + IOStatus Pad(const IOOptions& opts, const size_t pad_bytes, + const size_t max_pad_size); IOStatus Flush(const IOOptions& opts); diff --git a/folly.mk b/folly.mk new file mode 100644 index 000000000000..69f99b91a9aa --- /dev/null +++ b/folly.mk @@ -0,0 +1,165 @@ +# This file contains the vast majority of folly-related build configuration +# for the checkout_folly and build_folly targets, so that this file can be +# hashed for purposes of caching folly builds and not hitting that cache when +# something here changes. + +# This provides a Makefile simulation of a Meta-internal folly integration. +# It is not validated for general use. +# +# USE_FOLLY links the build targets with libfolly.a. The latter could be +# built using 'make build_folly', or built externally and specified in +# the CXXFLAGS and EXTRA_LDFLAGS env variables. The build_detect_platform +# script tries to detect if an external folly dependency has been specified. +# If not, it exports FOLLY_PATH to the path of the installed Folly and +# dependency libraries. +# +# USE_FOLLY_LITE cherry picks source files from Folly to include in the +# RocksDB library. Its faster and has fewer dependencies on 3rd party +# libraries, but with limited functionality. For example, coroutine +# functionality is not available. +ifeq ($(USE_FOLLY),1) +ifeq ($(USE_FOLLY_LITE),1) +$(error Please specify only one of USE_FOLLY and USE_FOLLY_LITE) +endif +ifneq ($(strip $(FOLLY_PATH)),) + BOOST_PATH = $(shell (ls -d $(FOLLY_PATH)/../boost*)) + DBL_CONV_PATH = $(shell (ls -d $(FOLLY_PATH)/../double-conversion*)) + GFLAGS_PATH = $(shell (ls -d $(FOLLY_PATH)/../gflags*)) + GLOG_PATH = $(shell (ls -d $(FOLLY_PATH)/../glog*)) + LIBEVENT_PATH = $(shell (ls -d $(FOLLY_PATH)/../libevent*)) + XZ_PATH = $(shell (ls -d $(FOLLY_PATH)/../xz*)) + LIBSODIUM_PATH = $(shell (ls -d $(FOLLY_PATH)/../libsodium*)) + FMT_PATH = $(shell (ls -d $(FOLLY_PATH)/../fmt*)) + + # For some reason, glog and fmt libraries are under either lib or lib64 + GLOG_LIB_PATH = $(shell (ls -d $(GLOG_PATH)/lib*)) + FMT_LIB_PATH = $(shell (ls -d $(FMT_PATH)/lib*)) + + # AIX: pre-defined system headers are surrounded by an extern "C" block + ifeq ($(PLATFORM), OS_AIX) + PLATFORM_CCFLAGS += -I$(BOOST_PATH)/include -I$(DBL_CONV_PATH)/include -I$(GLOG_PATH)/include -I$(LIBEVENT_PATH)/include -I$(XZ_PATH)/include -I$(LIBSODIUM_PATH)/include -I$(FOLLY_PATH)/include -I$(FMT_PATH)/include + PLATFORM_CXXFLAGS += -I$(BOOST_PATH)/include -I$(DBL_CONV_PATH)/include -I$(GLOG_PATH)/include -I$(LIBEVENT_PATH)/include -I$(XZ_PATH)/include -I$(LIBSODIUM_PATH)/include -I$(FOLLY_PATH)/include -I$(FMT_PATH)/include + else + PLATFORM_CCFLAGS += -isystem $(BOOST_PATH)/include -isystem $(DBL_CONV_PATH)/include -isystem $(GLOG_PATH)/include -isystem $(LIBEVENT_PATH)/include -isystem $(XZ_PATH)/include -isystem $(LIBSODIUM_PATH)/include -isystem $(FOLLY_PATH)/include -isystem $(FMT_PATH)/include + PLATFORM_CXXFLAGS += -isystem $(BOOST_PATH)/include -isystem $(DBL_CONV_PATH)/include -isystem $(GLOG_PATH)/include -isystem $(LIBEVENT_PATH)/include -isystem $(XZ_PATH)/include -isystem $(LIBSODIUM_PATH)/include -isystem $(FOLLY_PATH)/include -isystem $(FMT_PATH)/include + endif + + # Add -ldl at the end as gcc resolves a symbol in a library by searching only in libraries specified later + # in the command line + + PLATFORM_LDFLAGS += $(FOLLY_PATH)/lib/libfolly.a $(BOOST_PATH)/lib/libboost_context.a $(BOOST_PATH)/lib/libboost_filesystem.a $(BOOST_PATH)/lib/libboost_atomic.a $(BOOST_PATH)/lib/libboost_program_options.a $(BOOST_PATH)/lib/libboost_regex.a $(BOOST_PATH)/lib/libboost_system.a $(BOOST_PATH)/lib/libboost_thread.a $(DBL_CONV_PATH)/lib/libdouble-conversion.a $(LIBEVENT_PATH)/lib/libevent.a $(LIBSODIUM_PATH)/lib/libsodium.a -ldl +ifneq ($(DEBUG_LEVEL),0) + PLATFORM_LDFLAGS += $(FMT_LIB_PATH)/libfmtd.a $(GLOG_LIB_PATH)/libglogd.so $(GFLAGS_PATH)/lib/libgflags_debug.so.2.2 +else + PLATFORM_LDFLAGS += $(FMT_LIB_PATH)/libfmt.a $(GLOG_LIB_PATH)/libglog.so $(GFLAGS_PATH)/lib/libgflags.so.2.2 +endif + PLATFORM_LDFLAGS += -Wl,-rpath=$(GLOG_LIB_PATH) -Wl,-rpath=$(GFLAGS_PATH)/lib +endif + PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG + PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG +endif + +ifeq ($(USE_FOLLY_LITE),1) + # Path to the Folly source code and include files + FOLLY_DIR = ./third-party/folly +ifneq ($(strip $(BOOST_SOURCE_PATH)),) + BOOST_INCLUDE = $(shell (ls -d $(BOOST_SOURCE_PATH)/boost*/)) + # AIX: pre-defined system headers are surrounded by an extern "C" block + ifeq ($(PLATFORM), OS_AIX) + PLATFORM_CCFLAGS += -I$(BOOST_INCLUDE) + PLATFORM_CXXFLAGS += -I$(BOOST_INCLUDE) + else + PLATFORM_CCFLAGS += -isystem $(BOOST_INCLUDE) + PLATFORM_CXXFLAGS += -isystem $(BOOST_INCLUDE) + endif +endif # BOOST_SOURCE_PATH +ifneq ($(strip $(FMT_SOURCE_PATH)),) + FMT_INCLUDE = $(shell (ls -d $(FMT_SOURCE_PATH)/fmt*/include/)) + # AIX: pre-defined system headers are surrounded by an extern "C" block + ifeq ($(PLATFORM), OS_AIX) + PLATFORM_CCFLAGS += -I$(FMT_INCLUDE) + PLATFORM_CXXFLAGS += -I$(FMT_INCLUDE) + else + PLATFORM_CCFLAGS += -isystem $(FMT_INCLUDE) + PLATFORM_CXXFLAGS += -isystem $(FMT_INCLUDE) + endif +endif # FMT_SOURCE_PATH + # AIX: pre-defined system headers are surrounded by an extern "C" block + ifeq ($(PLATFORM), OS_AIX) + PLATFORM_CCFLAGS += -I$(FOLLY_DIR) + PLATFORM_CXXFLAGS += -I$(FOLLY_DIR) + else + PLATFORM_CCFLAGS += -isystem $(FOLLY_DIR) + PLATFORM_CXXFLAGS += -isystem $(FOLLY_DIR) + endif + PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG + PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG +# TODO: fix linking with fbcode compiler config + PLATFORM_LDFLAGS += -lglog +endif + +FOLLY_COMMIT_HASH = 1e8ce1e5d35acff7b78fedbca3e7311b39f43529 + +# For public CI runs, checkout folly in a way that can build with RocksDB. +# This is mostly intended as a test-only simulation of Meta-internal folly +# integration. +checkout_folly: + if [ -e third-party/folly ]; then \ + cd third-party/folly && ${GIT_COMMAND} fetch origin; \ + else \ + cd third-party && ${GIT_COMMAND} clone https://github.com/facebook/folly.git; \ + fi + @# Pin to a particular version for public CI, so that PR authors don't + @# need to worry about folly breaking our integration. Update periodically + cd third-party/folly && git reset --hard $(FOLLY_COMMIT_HASH) + @# Apparently missing include + perl -pi -e 's/(#include )/$$1\n#include /' third-party/folly/folly/lang/Exception.h + @# const mismatch + perl -pi -e 's/: environ/: (const char**)(environ)/' third-party/folly/folly/Subprocess.cpp + @# Restore cached downloads and handle unreliable mirrors with fallback + @cd third-party/folly && \ + DOWNLOAD_DIR=`$(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir | sed 's|/installed/.*|/downloads|'` && \ + mkdir -p "$$DOWNLOAD_DIR" && \ + CACHE_DIR="/tmp/rocksdb-getdeps-cache" && \ + mkdir -p "$$CACHE_DIR" && \ + echo "Restoring cached downloads..." && \ + if ls "$$CACHE_DIR"/*.tar.gz "$$CACHE_DIR"/*.tar.xz "$$CACHE_DIR"/*.zip >/dev/null 2>&1; then \ + cp -n "$$CACHE_DIR"/*.tar.gz "$$CACHE_DIR"/*.tar.xz "$$CACHE_DIR"/*.zip "$$DOWNLOAD_DIR/" 2>/dev/null || true; \ + fi && \ + echo "Handling known unreliable downloads with fallback mirrors..." && \ + $(PYTHON) ../../build_tools/getdeps_fallback_mirror.py "$$DOWNLOAD_DIR" "$$CACHE_DIR" build/fbcode_builder/manifests + @# NOTE: boost and fmt source will be needed for any build including `USE_FOLLY_LITE` builds as those depend on those headers + cd third-party/folly && GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py fetch boost && GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py fetch fmt + @# Update cache with any new downloads + @cd third-party/folly && \ + DOWNLOAD_DIR=`$(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir | sed 's|/installed/.*|/downloads|'` && \ + CACHE_DIR="/tmp/rocksdb-getdeps-cache" && \ + if ls "$$DOWNLOAD_DIR"/*.tar.gz "$$DOWNLOAD_DIR"/*.tar.xz "$$DOWNLOAD_DIR"/*.zip >/dev/null 2>&1; then \ + cp -n "$$DOWNLOAD_DIR"/*.tar.gz "$$DOWNLOAD_DIR"/*.tar.xz "$$DOWNLOAD_DIR"/*.zip "$$CACHE_DIR/" 2>/dev/null || true; \ + fi + +CXX_M_FLAGS = $(filter -m%, $(CXXFLAGS)) + +FOLLY_BUILD_FLAGS = --no-tests +# NOTE: To avoid ODR violations, we must build folly in debug mode iff +# building RocksDB in debug mode. +ifneq ($(DEBUG_LEVEL),0) +FOLLY_BUILD_FLAGS += --build-type Debug +endif + +build_folly: + FOLLY_INST_PATH=`cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \ + if [ "$$FOLLY_INST_PATH" ]; then \ + rm -rf $${FOLLY_INST_PATH}/../../*; \ + else \ + echo "Please run checkout_folly first"; \ + false; \ + fi + cd third-party/folly && \ + CXXFLAGS=" $(CXX_M_FLAGS) -DHAVE_CXX11_ATOMIC " GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py build $(FOLLY_BUILD_FLAGS) + @# In the folly build, glog and gflags are only built as dynamic libraries, + @# not static. This patchelf command is needed to reliably have the glog + @# library find its dependency gflags, because apparently the rpath of the + @# final binary is not used in resolving that transitive dependency. + FOLLY_INST_PATH=`cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \ + cd "$$FOLLY_INST_PATH" && patchelf --add-rpath $$PWD/../gflags-*/lib ../glog-*/lib*/libglog*.so.*.*.* diff --git a/fuzz/db_fuzzer.cc b/fuzz/db_fuzzer.cc index e6d5bb63c06f..7b10b35ce101 100644 --- a/fuzz/db_fuzzer.cc +++ b/fuzz/db_fuzzer.cc @@ -31,11 +31,11 @@ constexpr char db_path[] = "/tmp/testdb"; // enum. The goal is to capture sanitizer bugs, so the code should be // compiled with a given sanitizer (ASan, UBSan, MSan). extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { - ROCKSDB_NAMESPACE::DB* db; + std::unique_ptr db; ROCKSDB_NAMESPACE::Options options; + ROCKSDB_NAMESPACE::Status status; options.create_if_missing = true; - ROCKSDB_NAMESPACE::Status status = - ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db); + status = ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db); if (!status.ok()) { return 0; } @@ -88,7 +88,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { } case kOpenClose: { db->Close(); - delete db; + db.reset(); status = ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db); if (!status.ok()) { ROCKSDB_NAMESPACE::DestroyDB(db_path, options); @@ -104,7 +104,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { "new_cf", &cf); s = db->DestroyColumnFamilyHandle(cf); db->Close(); - delete db; + db.reset(); // open DB with two column families std::vector column_families; @@ -166,7 +166,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { // Cleanup DB db->Close(); - delete db; + db.reset(); ROCKSDB_NAMESPACE::DestroyDB(db_path, options); return 0; } diff --git a/fuzz/db_map_fuzzer.cc b/fuzz/db_map_fuzzer.cc index ed9df8f8432d..8c55ac4e9e7a 100644 --- a/fuzz/db_map_fuzzer.cc +++ b/fuzz/db_map_fuzzer.cc @@ -50,7 +50,7 @@ DEFINE_PROTO_FUZZER(DBOperations& input) { } std::map kv; - ROCKSDB_NAMESPACE::DB* db = nullptr; + std::unique_ptr db; ROCKSDB_NAMESPACE::Options options; options.create_if_missing = true; CHECK_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDbPath, &db)); @@ -86,8 +86,7 @@ DEFINE_PROTO_FUZZER(DBOperations& input) { } } CHECK_OK(db->Close()); - delete db; - db = nullptr; + db.reset(); CHECK_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDbPath, &db)); auto kv_it = kv.begin(); @@ -102,6 +101,6 @@ DEFINE_PROTO_FUZZER(DBOperations& input) { delete it; CHECK_OK(db->Close()); - delete db; + db.reset(); CHECK_OK(ROCKSDB_NAMESPACE::DestroyDB(kDbPath, options)); } diff --git a/fuzz/sst_file_writer_fuzzer.cc b/fuzz/sst_file_writer_fuzzer.cc index 676daf574fa4..ae17f64cd2fb 100644 --- a/fuzz/sst_file_writer_fuzzer.cc +++ b/fuzz/sst_file_writer_fuzzer.cc @@ -91,7 +91,8 @@ TableReader* NewTableReader(const std::string& sst_file_path, } if (s.ok()) { ImmutableOptions iopts(options, cf_ioptions); - TableReaderOptions t_opt(iopts, /*prefix_extractor=*/nullptr, env_options, + TableReaderOptions t_opt(iopts, /*prefix_extractor=*/nullptr, + /*compression_manager=*/nullptr, env_options, cf_ioptions.internal_comparator, 0 /* block_protection_bytes_per_key */); t_opt.largest_seqno = kMaxSequenceNumber; diff --git a/include/rocksdb/advanced_cache.h b/include/rocksdb/advanced_cache.h index d8eeb7d2e381..8142228205e4 100644 --- a/include/rocksdb/advanced_cache.h +++ b/include/rocksdb/advanced_cache.h @@ -318,7 +318,7 @@ class Cache : public Customizable { // REQUIRES: handle must have been returned by a method on *this. virtual bool Release(Handle* handle, bool erase_if_last_ref = false) = 0; - // Return the object assiciated with a handle returned by a successful + // Return the object associated with a handle returned by a successful // Lookup(). For historical reasons, this is also known at the "value" // associated with the key. // REQUIRES: handle must not have been released yet. diff --git a/include/rocksdb/advanced_compression.h b/include/rocksdb/advanced_compression.h new file mode 100644 index 000000000000..a680d870464f --- /dev/null +++ b/include/rocksdb/advanced_compression.h @@ -0,0 +1,699 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// APIs for customizing compression in RocksDB. +// +// *********************************************************************** +// EXPERIMENTAL - subject to change while under development +// *********************************************************************** + +#pragma once + +#include + +#include "rocksdb/cache.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/data_structure.h" + +namespace ROCKSDB_NAMESPACE { + +// TODO: alias/adapt for compression +struct FilterBuildingContext; +class Decompressor; + +// A Compressor represents a very specific but potentially adapting strategy for +// compressing blocks, including the relevant algorithm(s), options, dictionary, +// etc. as applicable--every input except the sequence of bytes to compress. +// Compressor is generally thread-safe so can be shared by multiple threads. (It +// could make sense to convert unique_ptr to +// shared_ptr.) A Compressor for data files is expected to be used +// for just one file, so that compression strategy can be explicitly +// reconsidered for each new file. However, a Compressor for in-memory use could +// live indefinitely. +// +// If a single thread is doing many compressions under the same strategy, it +// should request a WorkingArea that will in some cases make repeated +// compression in a single thread more efficient. Unlike the rest of Compressor, +// each WorkingArea can only be used by one thread at a time. WorkingAreas can +// have pre-allocated space and/or data structures, and/or thread-local +// statistics that are later incorporated into shared statistics objects. +// +// The Compressor marks each block with a CompressionType to guide +// decompression. However, the compression dictionary (or whether there is one +// associated) is determined at Compressor creation time, though the process of +// getting a Compressor with a dictionary starts with a Compressor without +// dictionary (which will often be relevant alongside); see relevant functions. +// If the Compressor wants to decide block-by-block whether to apply the +// configured dictionary, that would need to be encoded in CompressionType or +// the compressed output. (NOTE: this was historically NOT encoded in +// CompressionType and instead implied by BlockType and the presence of a +// dictionary block in the file. Some of the resulting awkwardness includes +// a number of built-in CompressionTypes that ignore any dictionary block in +// the file; therefore they cannot accommodate dictionary compression in the +// future without a schema change / extension.) +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class Compressor { + public: // Auxiliary types + // No dictionary should be used (for a given block type). + struct DictDisabled {}; + + // A recommendation for dictionary compression by collecting samples from + // blocks. The caller should collect up to `max_sample_bytes` of sample data + // and pass it to MaybeCloneSpecialized() to create a specialized compressor. + struct DictSampling { + // Maximum total bytes of sample data to collect from blocks. + // This controls how much data is buffered before dictionary training. + size_t max_sample_bytes = 0; + }; + + // A pre-defined dictionary that is recommended or specified for direct use + // with MaybeCloneSpecialized(), without any sampling. + struct DictPreDefined { + // The owned raw/serialized dictionary bytes. Recommend std::move to + // MaybeCloneSpecialized() + std::string dict_data; + }; + + // The result type for GetDictGuidance() - indicates how dictionary + // compression should be configured for a given block type. + using DictConfig = std::variant; + + // Sample data collected from blocks for dictionary training. + struct DictSamples { + // All the sample input blocks stored contiguously + std::string sample_data; + // The lengths of each of the sample blocks in `sample_data` + std::vector sample_lens; + + bool empty() const { return sample_data.empty(); } + bool Verify() const { + size_t total_len = 0; + for (auto len : sample_lens) { + total_len += len; + } + return total_len == sample_data.size(); + } + }; + + // Arguments for MaybeCloneSpecialized() - provides either samples, a + // pre-defined dictionary, or indicates no dictionary should be used. + // NOTE: DictPreDefined here is the same type as above, allowing the + // pre-defined dictionary from GetDictGuidance() to be passed through. + using DictConfigArgs = + std::variant; + + // A WorkingArea is an optional structure (both for callers and + // implementations) that can enable optimizing repeated compressions by + // reusing working space or thread-local tracking of statistics or trends. + // This enables use of ZSTD context, for example. + // + // EXTENSIBLE or reinterpret_cast-able by custom Compressor implementations + struct WorkingArea {}; + + public: // Functions + Compressor() = default; + virtual ~Compressor() = default; + + // Class name for logging / debugging purposes + virtual const char* Name() const = 0; + + // Potentially more elaborate identifier for logging / debugging purposes + virtual std::string GetId() const { + std::string id = Name(); + return id; + } + + // Returns the recommended dictionary configuration for the given block type. + // See the comments on DictConfig and variants for details. + // + // NOTE: This may be called on the "base" Compressor returned by + // CompressionManager, which is not yet configured with a dictionary, + // or it can be skipped by callers not intending to handle dictionary + // compression. + virtual DictConfig GetDictGuidance(CacheEntryRole block_type) const { + // Default implementation: no dictionary + (void)block_type; + return DictDisabled{}; + } + + // Returns the serialized form of the data dictionary associated with this + // Compressor. NOTE: empty dict is equivalent to no dict. + virtual Slice GetSerializedDict() const { return Slice(); } + + // If there's a dominant compression type returned by this compressor as + // configured, return it. Otherwise, return kDisableCompressionOption. + virtual CompressionType GetPreferredCompressionType() const { + return CompressionType::kDisableCompressionOption; + } + + // Return a distinct but functionally equivalent Compressor. This is often + // needed to implement MaybeCloneSpecialized() in wrapper compressors. + virtual std::unique_ptr Clone() const = 0; + + // Create potential variants of the same Compressor that might be + // (a) optimized for a particular block type (does not affect correct + // decompression), and/or + // (b) configured to use a compression dictionary based on the provided + // configuration (samples or pre-defined dictionary). See the comments on + // DictConfigArgs and its variants for detail. + // + // Return of nullptr indicates no specialization exists or was attempted + // and the caller should use the current Compressor for the desired scenario. + // Using CacheEntryRole::kMisc for block_type generally means "unspecified". + // + // The exact dictionary associated with a returned compressor must be read + // from GetSerializedDict(). + virtual std::unique_ptr MaybeCloneSpecialized( + CacheEntryRole block_type, DictConfigArgs&& dict_config) const { + // Default implementation: no specialization + (void)block_type; + (void)dict_config; + return nullptr; + } + + // A convenience function when a clone is needed and may or may not be + // specialized. + std::unique_ptr CloneMaybeSpecialized( + CacheEntryRole block_type, DictConfigArgs&& dict_config) const { + auto clone = MaybeCloneSpecialized(block_type, std::move(dict_config)); + if (clone == nullptr) { + clone = Clone(); + assert(clone != nullptr); + } + return clone; + } + + // To allow for flexible re-use / reclaimation, we have explicit Get and + // Release functions, and usually wrap in a special RAII smart pointer. + // For example, a WorkingArea could be saved/recycled in thread-local or + // core-local storage, or heap managed, etc., though an explicit WorkingArea + // is only advised for repeated compression (by a single thread). + // ReleaseWorkingArea() in not intended to be called directly, but used by + // ManagedWorkingArea. + virtual void ReleaseWorkingArea(WorkingArea*) {} + + using ManagedWorkingArea = + ManagedPtr; + + // See struct WorkingArea above + virtual ManagedWorkingArea ObtainWorkingArea() { + // Default implementation: no working area + return {}; + } + + // Compress `uncompressed_data` to buffer `compressed_output` of size + // `*compressed_output_size`, storing the final compressed size in + // `*compressed_output_size` and compression type in `*out_compression_type`. + // Note that the compressed output will be decompressed by the sequence + // Decompressor::ExtractUncompressedSize() followed by + // Decompressor::DecompressBlock(), which must also be provided the same + // CompressionType saved in `out_compression_type`. (In many configurations, + // `compressed_output` will have a prefix storing the uncompressed_data size + // before the compressed bytes returned by the underlying compression + // algorithm. And the compression type is usually stored adjacent to the + // compressed data, or in some cases assumed/asserted based on the particular + // Compressor.) + // + // If return status is not OK, then some fatal condition has arisen. On OK + // status, setting `*out_compression_type = kNoCompression` means compression + // is declined and the caller should use the original uncompressed_data and + // ignore any result in `compressed_output`. In this case, setting + // *compressed_output_size to 0 suggests that compression was quickly + // "bypassed" and *compressed_output_size > 0 suggests that compression was + // attempted but rejected (e.g. insufficient compression ratio). + // + // On OK status and `*out_compression_type != kNoCompression`, compression has + // happened with results in `compressed_output`, `compressed_output_size`, and + // `out_compression_type`. The output compression type is allowed to vary from + // call to call but does not for compressors from BuiltinV2CompressionManager. + // + // The working area is optional and used to optimize repeated compression by + // a single thread. ManagedWorkingArea is provided rather than just + // WorkingArea so that it can be used only if the `owner` matches expectation. + // This could be useful for a Compressor wrapping more than one alternative + // underlying Compressor. + virtual Status CompressBlock(Slice uncompressed_data, char* compressed_output, + size_t* compressed_output_size, + CompressionType* out_compression_type, + ManagedWorkingArea* working_area) = 0; + + // OPTIONAL: Return a decompressor that is optimized for output from this + // compressor. + virtual std::shared_ptr GetOptimizedDecompressor() const { + // Default implementation: no optimization. Get a Decompressor from the + // CompressionManager. + return nullptr; + } + + // TODO: something to populate table properties based on settings, after all + // or as WorkingAreas released. Maybe also update stats, or that could be in + // thread-specific WorkingArea. +}; + +// A Decompressor usually has a wide capability to decompress all kinds of +// compressed data in the scope of a CompressionManager (see that class below), +// except +// (a) it might be optimized for or limited to a particular compression type(s) +// (see GetDecompressor* functions for in CompressionManager), +// (b) distinct Decompressors are required to decompress with compression +// dictionaries. (Decompressors are generally associated with empty/no +// dictionary unless created with MaybeCloneForDict().) +// +// Similar to Compressor, Decompressor is generally thread safe except that each +// WorkingArea can only be used by a single thread at a time. +// +// Decompressors known to be associated with no dictionary are typically +// returned as shared_ptr, because they are broadly usable across threads. +// Because compression dictionaries are externally managed (see +// MaybeCloneForDict()), Decompressors associated with compression dictionaries +// are typically returned as unique_ptr, so that they are more easily +// guaranteed not to outlive their dictionaries (e.g. in block cache). +// Decompressors associated with compression dictionaries might include a +// processed or "digested" form of the raw dictionary for efficient repeated +// compressions. +// +// NOTE: Splitting the interface between ExtractUncompressedSize and +// DecompressBlock leaves to the caller details of (and flexibility in) +// allocating buffers for decompressing into. For example, the data could be +// decompressed into part of a single buffer allocated to hold a block's +// uncompressed contents along with an in-memory object representation of the +// block (to reduce fragmentation and other overheads of separate objects). +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class Decompressor { + public: + Decompressor() = default; + virtual ~Decompressor() = default; + + // A name for logging / debugging purposes + virtual const char* Name() const = 0; + + // A WorkingArea is an optional structure (both for callers and + // implementations) that can enable optimizing repeated decompressions by + // reusing working space or thread-local tracking of statistics. This enables + // use of ZSTD context, for example. + // + // EXTENSIBLE or reinterpret_cast-able by custom Compressor implementations + struct WorkingArea {}; + + // To allow for flexible re-use / reclaimation, we have explicit Obtain and + // Release functions, which are typically wrapped in a special RAII smart + // pointer. For example, a WorkingArea could be saved/recycled in thread-local + // or core-local storage, or heap managed, etc., though an explicit + // WorkingArea is only advised for repeated decompression (by a single + // thread). ReleaseWorkingArea() in not intended to be called directly, but + // used by ManagedWorkingArea. + virtual void ReleaseWorkingArea(WorkingArea* wa) { + // Default implementation: no working area + (void)wa; + assert(wa == nullptr); + } + + using ManagedWorkingArea = + ManagedPtr; + + virtual ManagedWorkingArea ObtainWorkingArea(CompressionType /*preferred*/) { + // Default implementation: no working area + return {}; + } + + // If this Decompressor is associated with a (de)compression dictionary + // (created with MaybeCloneForDict()), this returns a pointer to those raw (or + // "serialized") bytes, which are externally managed (see + // MaybeCloneForDict()). + // Default: empty slice => no dictionary + virtual const Slice& GetSerializedDict() const; + + // Create a variant of this Decompressor in `out` using the specified raw + // ("serialized") dictionary. This step is required for decompressing data + // compressed with the same dictionary. The new Decompressor references the + // given Slice through its lifetime so the data it points to must be managed + // by the caller along with (or beyond) the new Decompressor. If the + // dictionary is processed into a form reusable by repeated compressions in + // many threads, that happens within this call. + // + // Must return OK if and only if storing a result in `out`. Otherwise, could + // return values like NotSupported - dictionary compression is not (yet) + // supported for this kind of Decompressor. Corruption - dictionary is + // malformed (though many implementations will accept any data as a + // dictionary) + // + // RocksDB promises not to call this function with an empty dictionary slice + // (equivalent to no dictionary). + virtual Status MaybeCloneForDict(const Slice& /*serialized_dict*/, + std::unique_ptr* /*out*/) { + return Status::NotSupported( + "Dictionary compression not (yet) supported by " + std::string(Name())); + } + + // Memory size of this object and others it owns. Does not include the + // serialized dictionary (when used) which is externally managed. + virtual size_t ApproximateOwnedMemoryUsage() const { + // Default: negligible + return 0; + } + + // Potentially extensible by callers of Decompressor (but not recommended) + struct Args { + CompressionType compression_type = kNoCompression; + Slice compressed_data; + uint64_t uncompressed_size = 0; + ManagedWorkingArea* working_area = nullptr; + }; + + // For efficiency on the read path, RocksDB strongly prefers the uncompressed + // data size to be encoded in the compressed data in an easily accessible way, + // so that allocation of a potentially long-lived buffer can be ideally sized. + // This function determines the uncompressed size and potentially modifies + // `args.compressed_data` to strip off the size metadata, for providing both + // to DecompressBlock along with an appropriate buffer based on that size. + // Some implementations will leave `compressed_data` unmodified and let + // DecompressBlock call a library function that processes a format that + // includes size metadata (e.g. Snappy). + // + // Even for legacy cases without size metadata (e.g. some very old RocksDB + // formats), an exact size is required and could require decompressing the + // data (here and in DecompressBlock()). + // + // Return non-OK in case of corrupt data or some other unworkable limitation + // or failure. + // + // The default implementation uses a standard format for prepending + // uncompressed size to the compressed payload. (RocksDB + // compress_format_version=2 except Snappy) + virtual Status ExtractUncompressedSize(Args& args); + + // Called to decompress a block of data after running ExtractUncompressedSize + // on it. `args.compressed_data` is what ExtractUncompressedSize left there + // after potentially stripping off the uncompressed size metadata. Returns OK + // iff uncompressed data of size `uncompressed_size` is written to + // `uncompressed_output`. + virtual Status DecompressBlock(const Args& args, + char* uncompressed_output) = 0; +}; + +// A CompressionManager represents +// * When/where/how to use different compressions +// * A schema (or set of schemas) and implementation for mapping +// +// to uncompressed data (or error), which can expand over time (error in fewer +// cases) for a given CompatibilityName() but can never change that mapping +// (because that would break backward compatibility, potential quiet +// corruption) +// TODO: consider adding optional streaming compression support (low priority) +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class CompressionManager + : public std::enable_shared_from_this, + public Customizable { + public: + CompressionManager() = default; + virtual ~CompressionManager() = default; + static const char* Type() { return "CompressionManager"; } + + // *************** Creating various Compression Managers *************** // + // A name for the schema family of this CompressionManager. In short, if + // two CompressionManagers have functionally the same Decompressor(s), they + // should have the same CompatibilityName(), so that a compatible + // CompressionManager/Decompressor might be used if the original is + // unavailable. (Name() can be useful in addition to CompatibilityName() for + // understanding what compression strategy was used.) This name should be + // limited to legal variable names in C++ (alphanumeric and underscores). + virtual const char* CompatibilityName() const = 0; + + // Default implementation checks the current compatibility name and returns + // this CompressionManager (via `out`) if appropriate, and otherwise defers + // to CreateFromString(). Failure should simply be a matter of "not found" in + // which case nullptr is returned. + virtual std::shared_ptr FindCompatibleCompressionManager( + Slice compatibility_name); + + // Create or find a CompressionManager from a string, including built-in + // CompressionManager types. + // TODO: ObjectLibrary stuff + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* result); + + // Returns false iff a configuration that would pass the given compression + // type to GetCompressor/GetCompressorForSST should be rejected (not + // supported) + virtual bool SupportsCompressionType(CompressionType type) const = 0; + + // TODO: function to check compatibility with or sanitize CompressionOptions + + // ************************* Compressor creation *********************** // + // Returning nullptr means compression is entirely disabled for the file, + // which is valid at the discretion of the CompressionManager. Returning + // nullptr should normally be the result if preferred == kNoCompression. + // + // Compressors returned here are configured WITHOUT a dictionary, so that + // it's always possible to get correct compression->decompression results + // if not opting-in to dictionary handling. The compressors may recommend + // dictionary usage via GetDictGuidance() and creating a modified Compressor + // for that. See Compressor::GetDictGuidance() etc. for details. + // + // These functions must be thread-safe. + + // Get a compressor for an SST file. + // SUBJECT TO CHANGE + // TODO: is it practical to get ColumnFamilyOptions plumbed into here? + virtual std::unique_ptr GetCompressorForSST( + const FilterBuildingContext&, const CompressionOptions& opts, + CompressionType preferred) { + return GetCompressor(opts, preferred); + } + + // Get a compressor for a generic/unspecified purpose (e.g. in-memory + // compression). + virtual std::unique_ptr GetCompressor( + const CompressionOptions& opts, CompressionType type) = 0; + + // **************************** Decompressors ************************** // + // Get a decompressor that is compatible with any blocks compressed by + // compressors returned by this CompressionManager (at least this code + // revision and earlier). (NOTE: recommended to return a shared_ptr alias of + // this shared_ptr to a field that is a Decompressor.) + // Justification for not making CompressionManager inherit Decompressor: this + // tends to run into the diamond inheritance problem in implementations and + // potential overheads of virtual inheritance. + virtual std::shared_ptr GetDecompressor() = 0; + + // Compatible with same as above, but potentially optimized for a certain + // expected CompressionType + virtual std::shared_ptr GetDecompressorOptimizeFor( + CompressionType /*optimize_for_type*/) { + // Safe default implementation + return GetDecompressor(); + } + + // Get a decompressor that is allowed to have support only for the + // CompressionTypes in the given start-to-end array (unique, sorted by + // unsigned char) + virtual std::shared_ptr GetDecompressorForTypes( + const CompressionType* /*types_begin*/, + const CompressionType* /*types_end*/) { + // Safe default implementation + return GetDecompressor(); + } +}; + +// ************************* Utility wrappers etc. *********************** // +class CompressorWrapper : public Compressor { + public: + explicit CompressorWrapper(std::unique_ptr compressor) + : wrapped_(std::move(compressor)) {} + // No copies + CompressorWrapper(const CompressorWrapper&) = delete; + CompressorWrapper& operator=(const CompressorWrapper&) = delete; + + DictConfig GetDictGuidance(CacheEntryRole block_type) const override { + return wrapped_->GetDictGuidance(block_type); + } + + Slice GetSerializedDict() const override { + return wrapped_->GetSerializedDict(); + } + + CompressionType GetPreferredCompressionType() const override { + return wrapped_->GetPreferredCompressionType(); + } + + // NOTE: Clone() not implemented here because it needs to be in the derived + // class + + // NOTE: MaybeCloneSpecialized() is only implemented here for convenience + // when the wrapped Compressor uses the default implementation of + // MaybeCloneSpecialized(). This needs to be overridden if not. + std::unique_ptr MaybeCloneSpecialized( + CacheEntryRole block_type, DictConfigArgs&& dict_config) const override { + auto clone = + wrapped_->MaybeCloneSpecialized(block_type, std::move(dict_config)); + // Assert default no-op MaybeCloneSpecialized() + assert(clone == nullptr); + return clone; + } + + ManagedWorkingArea ObtainWorkingArea() override { + return wrapped_->ObtainWorkingArea(); + } + + // NOTE: Don't need to override ReleaseWorkingArea() here because + // ManagedWorkingArea takes care of calling it on the Compressor that created + // the WorkingArea. + + Status CompressBlock(Slice uncompressed_data, char* compressed_output, + size_t* compressed_output_size, + CompressionType* out_compression_type, + ManagedWorkingArea* working_area) override { + return wrapped_->CompressBlock(uncompressed_data, compressed_output, + compressed_output_size, out_compression_type, + working_area); + } + + std::shared_ptr GetOptimizedDecompressor() const override { + return wrapped_->GetOptimizedDecompressor(); + } + + protected: + std::unique_ptr wrapped_; +}; + +class DecompressorWrapper : public Decompressor { + public: + explicit DecompressorWrapper(std::shared_ptr decompressor) + : wrapped_(std::move(decompressor)) {} + // No copies + DecompressorWrapper(const DecompressorWrapper&) = delete; + DecompressorWrapper& operator=(const DecompressorWrapper&) = delete; + + const char* Name() const override { return wrapped_->Name(); } + + void ReleaseWorkingArea(WorkingArea* wa) override { + wrapped_->ReleaseWorkingArea(wa); + } + + // NOTE: Don't need to override ReleaseWorkingArea() here because + // ManagedWorkingArea takes care of calling it on the Decompressor that + // created the WorkingArea. + + ManagedWorkingArea ObtainWorkingArea(CompressionType preferred) override { + return wrapped_->ObtainWorkingArea(preferred); + } + + const Slice& GetSerializedDict() const override { + return wrapped_->GetSerializedDict(); + } + + Status MaybeCloneForDict(const Slice& serialized_dict, + std::unique_ptr* out) override { + // NOTE: derived class probably needs to override this to ensure a + // derived wrapper around the new Decompressor + return wrapped_->MaybeCloneForDict(serialized_dict, out); + } + + size_t ApproximateOwnedMemoryUsage() const override { + return wrapped_->ApproximateOwnedMemoryUsage(); + } + + Status ExtractUncompressedSize(Args& args) override { + return wrapped_->ExtractUncompressedSize(args); + } + + Status DecompressBlock(const Args& args, char* uncompressed_output) override { + return wrapped_->DecompressBlock(args, uncompressed_output); + } + + protected: + std::shared_ptr wrapped_; +}; + +// TODO: CompressorBase, for custom compressions + +class CompressionManagerWrapper : public CompressionManager { + public: + explicit CompressionManagerWrapper( + std::shared_ptr wrapped) + : wrapped_(std::move(wrapped)) {} + + const char* CompatibilityName() const override { + return wrapped_->CompatibilityName(); + } + + std::shared_ptr FindCompatibleCompressionManager( + Slice compatibility_name) override { + // NOTE: We expect that the wrapped CompressionManager will generally + // be preferred if compatible, so the default implementation here does + // not purely defer to the wrapped instance + if (compatibility_name == CompatibilityName()) { + return shared_from_this(); + } else { + return wrapped_->FindCompatibleCompressionManager(compatibility_name); + } + } + + bool SupportsCompressionType(CompressionType type) const override { + return wrapped_->SupportsCompressionType(type); + } + + std::unique_ptr GetCompressorForSST( + const FilterBuildingContext& context, const CompressionOptions& opts, + CompressionType preferred) override { + return wrapped_->GetCompressorForSST(context, opts, preferred); + } + + std::unique_ptr GetCompressor(const CompressionOptions& opts, + CompressionType type) override { + return wrapped_->GetCompressor(opts, type); + } + + std::shared_ptr GetDecompressor() override { + return wrapped_->GetDecompressor(); + } + + std::shared_ptr GetDecompressorOptimizeFor( + CompressionType optimize_for_type) override { + return wrapped_->GetDecompressorOptimizeFor(optimize_for_type); + } + + std::shared_ptr GetDecompressorForTypes( + const CompressionType* types_begin, + const CompressionType* types_end) override { + return wrapped_->GetDecompressorForTypes(types_begin, types_end); + } + + protected: + std::shared_ptr wrapped_; +}; + +// Compression manager that implements the second schema for RocksDB built-in +// compression support. (The first schema is intentionally not provided here.) +// *** CURRENT STATE *** +// This is currently the latest schema for built-in compression, and the +// compression manager used when compression_manager=nullptr. +const std::shared_ptr& GetBuiltinV2CompressionManager(); + +// NOTE: No GetLatestBuiltinCompressionManager() is provided because that could +// lead to unexpected schema changes for user CompressionManagers building on +// the built-in schema, in the unlikely/rare case of a new built-in schema. + +// Creates CompressionManager designed for the automated compression strategy. +// This may include deciding to compress or not. +// EXPERIMENTAL +std::shared_ptr CreateAutoSkipCompressionManager( + std::shared_ptr wrapped = nullptr); +// Creates CompressionManager designed for the CPU and IO cost aware compression +// strategy +// EXPERIMENTAL +std::shared_ptr CreateCostAwareCompressionManager( + std::shared_ptr wrapped = nullptr); +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/advanced_iterator.h b/include/rocksdb/advanced_iterator.h new file mode 100644 index 000000000000..abab5aeb4574 --- /dev/null +++ b/include/rocksdb/advanced_iterator.h @@ -0,0 +1,36 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +enum class IterBoundCheck : char { + kUnknown = 0, + kOutOfBound, + kInbound, +}; + +// This structure encapsulates the result of NextAndGetResult() +struct IterateResult { + // The lifetime of key is guaranteed until Next()/NextAndGetResult() is + // called. + Slice key; + // If the iterator becomes invalid after a NextAndGetResult(), the table + // iterator should set this to indicate whether it became invalid due + // to the next key being out of bound (kOutOfBound) or it reached end + // of file (kUnknown). If the iiterator is still valid, this should + // be set to kInbound. + IterBoundCheck bound_check_result = IterBoundCheck::kUnknown; + // If false, PrepareValue() needs to be called before value() + // This is useful if the table reader wants to materialize the value in a + // lazy manner. In that case, it can set this to false and RocksDB + // guarantees that it'll call PrepareValue() before calling value(). + bool value_prepared = true; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index ad9b90f735bb..898d07a6021d 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -64,9 +64,7 @@ enum CompactionPri : char { struct FileTemperatureAge { Temperature temperature = Temperature::kUnknown; uint64_t age = 0; -#if __cplusplus >= 202002L bool operator==(const FileTemperatureAge& rhs) const = default; -#endif }; struct CompactionOptionsFIFO { @@ -115,14 +113,71 @@ struct CompactionOptionsFIFO { // Default: empty std::vector file_temperature_age_thresholds{}; + // EXPERIMENTAL + // If true, when compaction is picked for kChangeTemperature reason, + // allow the trivia copy of the sst file from source FileSystem to + // destination FileSystem. If false, the changeTemperature will be + // the non-trivial copy by iterating/appending blocks by blocks of the + // sst file. + bool allow_trivial_copy_when_change_temperature = false; + + // EXPERIMENTAL + // If 'allow_trivia_copy_op_when_change_temperature=true', the tmp buffer size + // to copy the file from the source FileSystem to the destnation FileSystem. + // If 'allow_trivia_copy_op_when_change_temperature=false', this field will + // not be used. The minmum buffer size must be at least 4KiB + uint64_t trivial_copy_buffer_size = 4096; + + // When non-zero, FIFO compaction uses the combined size of SST files and + // blob files for size-based trimming decisions. When the total data size + // (SST + blob) exceeds this limit, the oldest SST files are dropped along + // with their associated blob files. + // + // When non-zero, this takes precedence over max_table_files_size for all + // FIFO compaction decisions: size-based dropping, TTL threshold checks, + // and compaction score computation. max_table_files_size is ignored. + // + // When zero (default), FIFO compaction uses max_table_files_size which + // only considers SST file sizes, maintaining backward compatibility. + // + // This option is primarily intended for use with integrated BlobDB where + // blob files can represent a significant portion of the total data. + // + // Dynamically changeable through SetOptions() API. + // Default: 0 (use max_table_files_size behavior) + uint64_t max_data_files_size = 0; + + // When true, enables a capacity-derived intra-L0 compaction strategy + // optimized for BlobDB workloads where SST files are much smaller than + // write_buffer_size. Uses the observed key/value size ratio (SST vs blob + // file sizes) to compute a target compacted file size, producing uniform + // files for predictable FIFO trimming. + // + // Uses level0_file_num_compaction_trigger as the target max L0 file count. + // + // When max_compaction_bytes is 0, the target is auto-calculated from the + // data capacity and observed SST/blob ratio. When max_compaction_bytes is + // explicitly set to a non-zero value, it overrides the auto-calculated + // target. + // + // Requires: + // - allow_compaction = true (master switch for intra-L0 compaction) + // - max_data_files_size > 0 (needed to compute the target file size) + // Setting this to true without these will fail option validation. + // + // When false, the old intra-L0 strategy is used if allow_compaction is + // true (PickCostBasedIntraL0Compaction with 1.1 * write_buffer_size guard). + // + // Dynamically changeable through SetOptions() API. + // Default: false + bool use_kv_ratio_compaction = false; + CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {} CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction) : max_table_files_size(_max_table_files_size), allow_compaction(_allow_compaction) {} -#if __cplusplus >= 202002L bool operator==(const CompactionOptionsFIFO& rhs) const = default; -#endif }; // The control option of how the cache tiers will be used. Currently rocksdb @@ -145,6 +200,61 @@ enum class PrepopulateBlobCache : uint8_t { kFlushOnly = 0x1, // Prepopulate blobs during flush only }; +// Bitmask enum for verify output flags during compaction. +// This allows fine-grained control over what verification is performed +// on compaction output files and when it's enabled. +enum class VerifyOutputFlags : uint32_t { + kVerifyNone = 0x0, // No verification + + // First set of bits: type of verifications + kVerifyBlockChecksum = 1 << 0, // Verify block checksums + kVerifyIteration = 1 << 1, // Verify iteration and full key/value hash + // by comparing the one inserted into a + // file, and what is read back. + + // TODO - Implement + // kVerifyFileChecksum = 1 << 2, // Verify file-level checksum + + // Second set of bits: when to enable verification + kEnableForLocalCompaction = 1 << 10, // Enable for local compaction + kEnableForRemoteCompaction = 1 << 11, // Enable for remote compaction + + // TODO - Implement + // kEnableForFlush = 1 << 12, // Enable for flush + + kVerifyAll = 0xFFFFFFFF, +}; + +inline VerifyOutputFlags operator|(VerifyOutputFlags lhs, + VerifyOutputFlags rhs) { + using T = std::underlying_type_t; + return static_cast(static_cast(lhs) | + static_cast(rhs)); +} + +inline VerifyOutputFlags& operator|=(VerifyOutputFlags& lhs, + VerifyOutputFlags rhs) { + lhs = lhs | rhs; + return lhs; +} + +inline VerifyOutputFlags operator&(VerifyOutputFlags lhs, + VerifyOutputFlags rhs) { + using T = std::underlying_type_t; + return static_cast(static_cast(lhs) & + static_cast(rhs)); +} + +inline VerifyOutputFlags& operator&=(VerifyOutputFlags& lhs, + VerifyOutputFlags rhs) { + lhs = lhs & rhs; + return lhs; +} + +inline bool operator!(VerifyOutputFlags flag) { + return flag == VerifyOutputFlags::kVerifyNone; +} + struct AdvancedColumnFamilyOptions { // The maximum number of write buffers that are built up in memory. // The default and the minimum number is 2, so that when 1 write buffer @@ -171,15 +281,6 @@ struct AdvancedColumnFamilyOptions { // Default: 1 int min_write_buffer_number_to_merge = 1; - // DEPRECATED - // The total maximum number of write buffers to maintain in memory including - // copies of buffers that have already been flushed. Unlike - // max_write_buffer_number, this parameter does not affect flushing. - // This parameter is being replaced by max_write_buffer_size_to_maintain. - // If both parameters are set to non-zero values, this parameter will be - // ignored. - int max_write_buffer_number_to_maintain = 0; - // The target number of write history bytes to hold in memory. Write history // comprises the latest write buffers (memtables). To reach the target, write // buffers that were most recently flushed to SST files may be retained in @@ -471,6 +572,17 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API int target_file_size_multiplier = 1; + // If true, RocksDB will consider the estimated tail size (filter + index + + // meta blocks) when deciding whether to cut a compaction output file. This + // helps prevent output files from exceeding the target_file_size_base due to + // large tail blocks. When disabled, only the data block size is considered, + // which may result in SST files exceeding the target_file_size_base. + // + // Default: false + // + // Dynamically changeable through SetOptions() API + bool target_file_size_is_upper_bound = false; + // If true, RocksDB will pick target size of each level dynamically. // We will pick a base level b >= 1. L0 will be directly merged into level b, // instead of always into level 1. Level 1 to b-1 need to be empty. @@ -520,7 +632,7 @@ struct AdvancedColumnFamilyOptions { // By doing it, we give max_bytes_for_level_multiplier a priority against // max_bytes_for_level_base, for a more predictable LSM tree shape. It is // useful to limit worse case space amplification. - // If `allow_ingest_behind=true` or `preclude_last_level_data_seconds > 0`, + // If `cf_allow_ingest_behind=true` or `preclude_last_level_data_seconds > 0`, // then the last level is reserved, and we will start filling LSM from the // second last level. // @@ -575,6 +687,15 @@ struct AdvancedColumnFamilyOptions { // // Default: target_file_size_base * 25 // + // For FIFO compaction with use_kv_ratio_compaction=true: + // When set to 0 (and compaction_style is FIFO), the value is NOT sanitized + // to the default. Instead, the target compacted file size is automatically + // calculated from the data capacity (max_data_files_size) and observed + // SST/blob ratio. When explicitly set to a non-zero value, it overrides + // the auto-calculated target and is used directly as the max compaction + // input size. Note: for FIFO, this controls the output file size target, + // not a general compaction byte limit as in level/universal compaction. + // // Dynamically changeable through SetOptions() API uint64_t max_compaction_bytes = 0; @@ -702,6 +823,13 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API bool paranoid_file_checks = false; + // Bitmask enum for output verification option. + // + // Default: 0 (kVerifyNone) + // + // Dynamically changeable (as a uint32_t) through SetOptions() API. + VerifyOutputFlags verify_output_flags = VerifyOutputFlags::kVerifyNone; + // In debug mode, RocksDB runs consistency checks on the LSM every time the // LSM changes (Flush, Compaction, AddFile). When this option is true, these // checks are also enabled in release mode. These checks were historically @@ -719,6 +847,17 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API bool report_bg_io_stats = false; + // Setting this option to true disallows ordinary writes to the column family + // and it can only be populated through import and ingestion. It is intended + // to protect "ingestion only" column families. This option is not currently + // supported on the default column family because of error handling challenges + // analogous to https://github.com/facebook/rocksdb/issues/13429 + // + // This option is not mutable with SetOptions(). It can be changed between + // DB::Open() calls, but open will fail if recovering WAL writes to a CF with + // this option set. + bool disallow_memtable_writes = false; + // This option has different meanings for different compaction styles: // // Leveled: Non-bottom-level files with all keys older than TTL will go @@ -846,7 +985,7 @@ struct AdvancedColumnFamilyOptions { // // Default: 0 (disable the feature) // - // Not dynamically changeable, change it requires db restart. + // Dynamically changeable through the SetOptions() API uint64_t preclude_last_level_data_seconds = 0; // EXPERIMENTAL @@ -869,7 +1008,7 @@ struct AdvancedColumnFamilyOptions { // // Default: 0 (disable the feature) // - // Not dynamically changeable, change it requires db restart. + // Dynamically changeable through the SetOptions() API uint64_t preserve_internal_time_seconds = 0; // When set, large values (blobs) are written to separate blob files, and @@ -1088,12 +1227,84 @@ struct AdvancedColumnFamilyOptions { uint32_t bottommost_file_compaction_delay = 0; // Enables additional integrity checks during reads/scans. - // Specifically, for skiplist-based memtables, we verify that keys visited - // are in order. This is helpful to detect corrupted memtable keys during - // reads. Enabling this feature incurs a performance overhead due to an - // additional key comparison during memtable lookup. + // Specifically, for skiplist-based memtables, key ordering validation could + // be enabled optionally. This is helpful to detect corrupted memtable keys + // during reads. Enabling this feature incurs a performance overhead due to + // additional comparison during memtable lookup. bool paranoid_memory_checks = false; + // Enables additional integrity checks during seek. + // Specifically, for skiplist-based memtables, key checksum validation could + // be enabled during seek optionally. This is helpful to detect corrupted + // memtable keys during reads. Enabling this feature incurs a performance + // overhead due to additional key checksum validation during memtable seek + // operation. + // This option depends on memtable_protection_bytes_per_key to be non zero. + // If memtable_protection_bytes_per_key is zero, no validation is performed. + bool memtable_veirfy_per_key_checksum_on_seek = false; + + // When an iterator scans this number of invisible entries (tombstones or + // hidden puts) from the active memtable during a single iterator operation, + // we will attempt to flush the memtable. Currently only forward scans are + // supported (SeekToFirst(), Seek() and Next()). + // This option helps to reduce the overhead of scanning through a + // large number of entries in memtable. + // Users should consider enable deletion-triggered-compaction (see + // CompactOnDeletionCollectorFactory) together with this option to compact + // away tombstones after the memtable is flushed. + // + // Note that this option has no effect on tailing iterators yet. + // + // Default: 0 (disabled) + // Dynamically changeable through the SetOptions() API. + uint32_t memtable_op_scan_flush_trigger = 0; + + // Similar to `memtable_op_scan_flush_trigger`, but this option applies to + // Next() calls between Seeks or until iterator destruction. If the average + // of the number of invisible entries scanned from the active memtable, the + // memtable will be marked for flush. + // Note that to avoid the case where the window between Seeks is too small, + // the option only takes effect if the total number of hidden entries scanned + // within a window is at least `memtable_op_scan_flush_trigger`. So this + // option is only effective when `memtable_op_scan_flush_trigger` is set. + // + // This option should be set to a lower value than + // `memtable_op_scan_flush_trigger`. It covers the case where an iterator + // scans through an expensive key range with many invisible entries from the + // active memtable, but the number of invisible entries per operation does not + // exceed `memtable_op_scan_flush_trigger`. + // + // Default: 0 (disabled) + // Dynamically changeable through the SetOptions() API. + uint32_t memtable_avg_op_scan_flush_trigger = 0; + + // If either DBOptions::allow_ingest_behind or this option is set to true, + // this column family will prepare for ingesting files to the last level + // (IngestExternalFiles() with ingest_behind=true). Users should set only + // this option since DBOptions::allow_ingest_behind is deprecated. + // + // Specifically, preparing a column family for ingesting files to the last + // level has the following effects: + // 1) Disables some internal optimizations around SST file compression. + // 2) Reserves the last level for ingested files only. + // 3) Compaction will not include any file from the last level. + // 4) Compaction will preserve necessary tombstones that can apply on + // top of ingested files. + // + // Note that only Universal Compaction supports cf_allow_ingest_behind. + // `num_levels` should be >= 3 if this option is turned on. + // + // Note that this option needs to be set to true before any write to the CF. + // It's recommended to set the option to true since CF creation. Otherwise, + // ingestion with ingest_behind = true might fail. Once file ingestions are + // done, the option should be flipped to false. Flipping this option to false + // allows the CF to disable the behavior changes detailed above and resume + // more efficient operation. + // + // Default: false + // Immutable. + bool cf_allow_ingest_behind = false; + // Create ColumnFamilyOptions with default values for all fields AdvancedColumnFamilyOptions(); // Create ColumnFamilyOptions from Options diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index f2616ea3e7f8..3ab0c8551d34 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -69,6 +69,7 @@ extern "C" { /* Exported types */ typedef struct rocksdb_t rocksdb_t; +typedef struct rocksdb_status_ptr_t rocksdb_status_ptr_t; typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t; typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t; typedef struct rocksdb_backup_engine_options_t rocksdb_backup_engine_options_t; @@ -79,11 +80,18 @@ typedef struct rocksdb_hyper_clock_cache_options_t rocksdb_hyper_clock_cache_options_t; typedef struct rocksdb_cache_t rocksdb_cache_t; typedef struct rocksdb_write_buffer_manager_t rocksdb_write_buffer_manager_t; +typedef struct rocksdb_sst_file_manager_t rocksdb_sst_file_manager_t; typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t; typedef struct rocksdb_compactionfiltercontext_t rocksdb_compactionfiltercontext_t; typedef struct rocksdb_compactionfilterfactory_t rocksdb_compactionfilterfactory_t; +typedef struct rocksdb_file_checksum_gen_factory_t + rocksdb_file_checksum_gen_factory_t; +typedef struct rocksdb_sst_partitioner_factory_t + rocksdb_sst_partitioner_factory_t; +typedef struct rocksdb_table_properties_collector_factory_t + rocksdb_table_properties_collector_factory_t; typedef struct rocksdb_comparator_t rocksdb_comparator_t; typedef struct rocksdb_dbpath_t rocksdb_dbpath_t; typedef struct rocksdb_env_t rocksdb_env_t; @@ -111,10 +119,15 @@ typedef struct rocksdb_writebatch_wi_t rocksdb_writebatch_wi_t; typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t; typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t; +typedef struct rocksdb_livefile_t rocksdb_livefile_t; typedef struct rocksdb_livefiles_t rocksdb_livefiles_t; typedef struct rocksdb_column_family_handle_t rocksdb_column_family_handle_t; typedef struct rocksdb_column_family_metadata_t rocksdb_column_family_metadata_t; +typedef struct rocksdb_import_column_family_options_t + rocksdb_import_column_family_options_t; +typedef struct rocksdb_export_import_files_metadata_t + rocksdb_export_import_files_metadata_t; typedef struct rocksdb_level_metadata_t rocksdb_level_metadata_t; typedef struct rocksdb_sst_file_metadata_t rocksdb_sst_file_metadata_t; typedef struct rocksdb_envoptions_t rocksdb_envoptions_t; @@ -142,6 +155,48 @@ typedef struct rocksdb_statistics_histogram_data_t typedef struct rocksdb_wait_for_compact_options_t rocksdb_wait_for_compact_options_t; +/* rocksdb_slice_t: Optimized slice type for high-performance C API operations + * This struct is ABI-compatible with rocksdb::Slice for zero-copy interop. + * Used by slice iterator functions and batched operations. */ +typedef struct rocksdb_slice_t { + const char* data; + size_t size; +} rocksdb_slice_t; +typedef struct rocksdb_flushjobinfo_t rocksdb_flushjobinfo_t; +typedef struct rocksdb_compactionjobinfo_t rocksdb_compactionjobinfo_t; +typedef struct rocksdb_subcompactionjobinfo_t rocksdb_subcompactionjobinfo_t; +typedef struct rocksdb_externalfileingestioninfo_t + rocksdb_externalfileingestioninfo_t; +typedef struct rocksdb_eventlistener_t rocksdb_eventlistener_t; +typedef struct rocksdb_writestallinfo_t rocksdb_writestallinfo_t; +typedef struct rocksdb_writestallcondition_t rocksdb_writestallcondition_t; +typedef struct rocksdb_memtableinfo_t rocksdb_memtableinfo_t; + +// Remote Compaction typedef +typedef struct rocksdb_compactionservice_scheduleresponse_t + rocksdb_compactionservice_scheduleresponse_t; +typedef struct rocksdb_compactionservice_jobinfo_t + rocksdb_compactionservice_jobinfo_t; +typedef struct rocksdb_compactionservice_t rocksdb_compactionservice_t; +typedef struct rocksdb_compaction_service_options_override_t + rocksdb_compaction_service_options_override_t; +typedef struct rocksdb_open_and_compact_options_t + rocksdb_open_and_compact_options_t; +typedef rocksdb_compactionservice_scheduleresponse_t* ( + *rocksdb_compaction_service_schedule_cb)( + void* state, const rocksdb_compactionservice_jobinfo_t* info, + const char* compaction_service_input, size_t input_len); + +typedef int (*rocksdb_compaction_service_wait_cb)(void* state, + const char* scheduled_job_id, + char** result, + size_t* result_len); + +typedef void (*rocksdb_compaction_service_cancel_awaiting_jobs_cb)(void* state); + +typedef void (*rocksdb_compaction_service_on_installation_cb)( + void* state, const char* scheduled_job_id, int status); + /* DB operations */ extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open( @@ -366,6 +421,12 @@ extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_create( rocksdb_checkpoint_t* checkpoint, const char* checkpoint_dir, uint64_t log_size_for_flush, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_export_import_files_metadata_t* +rocksdb_checkpoint_export_column_family( + rocksdb_checkpoint_t* checkpoint, + rocksdb_column_family_handle_t* column_family, const char* export_dir, + char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_object_destroy( rocksdb_checkpoint_t* checkpoint); @@ -426,6 +487,13 @@ rocksdb_create_column_families(rocksdb_t* db, extern ROCKSDB_LIBRARY_API void rocksdb_create_column_families_destroy( rocksdb_column_family_handle_t** list); +extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* +rocksdb_create_column_family_with_import( + rocksdb_t* db, rocksdb_options_t* column_family_options, + const char* column_family_name, + rocksdb_import_column_family_options_t* import_options, + rocksdb_export_import_files_metadata_t* metadata, char** errptr); + extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* rocksdb_create_column_family_with_ttl( rocksdb_t* db, const rocksdb_options_t* column_family_options, @@ -581,6 +649,16 @@ extern ROCKSDB_LIBRARY_API void rocksdb_batched_multi_get_cf( const char* const* keys_list, const size_t* keys_list_sizes, rocksdb_pinnableslice_t** values, char** errs, const bool sorted_input); +/* Batched MultiGet with slice array: Takes rocksdb_slice_t array directly, + * avoiding key conversion. faster than rocksdb_batched_multi_get_cf for + * operations with many keys. Eliminates overhead of converting keys from + * separate pointer+size arrays to Slice objects. */ +extern ROCKSDB_LIBRARY_API void rocksdb_batched_multi_get_cf_slice( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, size_t num_keys, + const rocksdb_slice_t* keys_list, rocksdb_pinnableslice_t** values, + char** errs, const bool sorted_input); + // The value is only allocated (using malloc) and returned if it is found and // value_found isn't NULL. In that case the user is responsible for freeing it. extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist( @@ -747,6 +825,18 @@ extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_timestamp( const rocksdb_iterator_t*, size_t* tslen); extern ROCKSDB_LIBRARY_API void rocksdb_iter_get_error( const rocksdb_iterator_t*, char** errptr); + +/* Slice iterator functions: Return rocksdb_slice_t directly for better + * performance. These functions avoid the overhead of passing output parameters + * and provide zero-copy access to key/value/timestamp data. faster than + * traditional rocksdb_iter_key/value/timestamp functions. */ +extern ROCKSDB_LIBRARY_API rocksdb_slice_t +rocksdb_iter_key_slice(const rocksdb_iterator_t* iter); +extern ROCKSDB_LIBRARY_API rocksdb_slice_t +rocksdb_iter_value_slice(const rocksdb_iterator_t* iter); +extern ROCKSDB_LIBRARY_API rocksdb_slice_t +rocksdb_iter_timestamp_slice(const rocksdb_iterator_t* iter); + extern ROCKSDB_LIBRARY_API void rocksdb_iter_refresh( const rocksdb_iterator_t* iter, char** errptr); @@ -860,6 +950,11 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate( rocksdb_writebatch_t*, void* state, void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), void (*deleted)(void*, const char* k, size_t klen)); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate_ld( + rocksdb_writebatch_t*, void* state, + void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), + void (*deleted)(void*, const char* k, size_t klen), + void (*log_data)(void*, const char* blob, size_t blob_len)); extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate_cf( rocksdb_writebatch_t*, void* state, void (*put_cf)(void*, uint32_t cfid, const char* k, size_t klen, @@ -867,6 +962,14 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate_cf( void (*deleted_cf)(void*, uint32_t cfid, const char* k, size_t klen), void (*merge_cf)(void*, uint32_t cfid, const char* k, size_t klen, const char* v, size_t vlen)); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate_cf_ld( + rocksdb_writebatch_t*, void* state, + void (*put_cf)(void*, uint32_t cfid, const char* k, size_t klen, + const char* v, size_t vlen), + void (*deleted_cf)(void*, uint32_t cfid, const char* k, size_t klen), + void (*merge_cf)(void*, uint32_t cfid, const char* k, size_t klen, + const char* v, size_t vlen), + void (*log_data)(void*, const char* blob, size_t blob_len)); extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_data( rocksdb_writebatch_t*, size_t* size); extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_set_save_point( @@ -986,11 +1089,22 @@ extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db( rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, size_t keylen, size_t* vallen, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_writebatch_wi_get_pinned_from_batch_and_db( + rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db, + const rocksdb_readoptions_t* options, const char* key, size_t keylen, + char** errptr); extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db_cf( rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db, const rocksdb_readoptions_t* options, rocksdb_column_family_handle_t* column_family, const char* key, size_t keylen, size_t* vallen, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* +rocksdb_writebatch_wi_get_pinned_from_batch_and_db_cf( + rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); extern ROCKSDB_LIBRARY_API void rocksdb_write_writebatch_wi( rocksdb_t* db, const rocksdb_writeoptions_t* options, rocksdb_writebatch_wi_t* wbwi, char** errptr); @@ -998,13 +1112,20 @@ extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base( rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator); extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_writebatch_wi_create_iterator_with_base_readopts( + rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator, + const rocksdb_readoptions_t* options); +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf( rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator, rocksdb_column_family_handle_t* cf); +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_writebatch_wi_create_iterator_with_base_cf_readopts( + rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator, + rocksdb_column_family_handle_t* cf, const rocksdb_readoptions_t* options); extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_update_timestamps( rocksdb_writebatch_wi_t* wbwi, const char* ts, size_t tslen, void* state, size_t (*get_ts_size)(void*, uint32_t), char** errptr); - /* Options utils */ // Load the latest rocksdb options from the specified db_path. @@ -1088,6 +1209,13 @@ enum { extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_data_block_index_type( rocksdb_block_based_table_options_t*, int); // uses one of the above enums +enum { + rocksdb_block_based_table_index_block_search_type_binary = 0, + rocksdb_block_based_table_index_block_search_type_interpolation = 1, +}; +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_index_block_search_type( + rocksdb_block_based_table_options_t*, int); // uses one of the above enums extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_data_block_hash_ratio( rocksdb_block_based_table_options_t* options, double v); @@ -1123,8 +1251,150 @@ rocksdb_block_based_options_set_partition_pinning_tier( extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_unpartitioned_pinning_tier( rocksdb_block_based_table_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_align( + rocksdb_block_based_table_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_manager( rocksdb_options_t* opt, rocksdb_write_buffer_manager_t* wbm); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_sst_file_manager( + rocksdb_options_t* opt, rocksdb_sst_file_manager_t* sfm); + +/* Flush job info */ + +extern ROCKSDB_LIBRARY_API const char* rocksdb_flushjobinfo_cf_name( + const rocksdb_flushjobinfo_t*, size_t*); +extern ROCKSDB_LIBRARY_API const char* rocksdb_flushjobinfo_file_path( + const rocksdb_flushjobinfo_t*, size_t*); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_flushjobinfo_triggered_writes_slowdown(const rocksdb_flushjobinfo_t*); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_flushjobinfo_triggered_writes_stop(const rocksdb_flushjobinfo_t*); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_flushjobinfo_largest_seqno(const rocksdb_flushjobinfo_t*); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_flushjobinfo_smallest_seqno(const rocksdb_flushjobinfo_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_reset_status( + rocksdb_status_ptr_t* status_ptr); +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_flushjobinfo_flush_reason(const rocksdb_flushjobinfo_t* info); +extern ROCKSDB_LIBRARY_API void rocksdb_status_ptr_get_error( + rocksdb_status_ptr_t* status, char** errptr); + +/* Compaction job info */ +extern ROCKSDB_LIBRARY_API void rocksdb_compactionjobinfo_status( + const rocksdb_compactionjobinfo_t* info, char** errptr); +extern ROCKSDB_LIBRARY_API const char* rocksdb_compactionjobinfo_cf_name( + const rocksdb_compactionjobinfo_t*, size_t*); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_compactionjobinfo_input_files_count(const rocksdb_compactionjobinfo_t*); +extern ROCKSDB_LIBRARY_API const char* rocksdb_compactionjobinfo_input_file_at( + const rocksdb_compactionjobinfo_t*, size_t pos, size_t*); +extern ROCKSDB_LIBRARY_API size_t rocksdb_compactionjobinfo_output_files_count( + const rocksdb_compactionjobinfo_t*); +extern ROCKSDB_LIBRARY_API const char* rocksdb_compactionjobinfo_output_file_at( + const rocksdb_compactionjobinfo_t*, size_t pos, size_t*); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_compactionjobinfo_elapsed_micros(const rocksdb_compactionjobinfo_t*); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_compactionjobinfo_num_corrupt_keys(const rocksdb_compactionjobinfo_t*); +extern ROCKSDB_LIBRARY_API int rocksdb_compactionjobinfo_base_input_level( + const rocksdb_compactionjobinfo_t*); +extern ROCKSDB_LIBRARY_API int rocksdb_compactionjobinfo_output_level( + const rocksdb_compactionjobinfo_t*); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_compactionjobinfo_input_records(const rocksdb_compactionjobinfo_t*); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_compactionjobinfo_output_records(const rocksdb_compactionjobinfo_t*); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_compactionjobinfo_total_input_bytes(const rocksdb_compactionjobinfo_t*); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_compactionjobinfo_total_output_bytes( + const rocksdb_compactionjobinfo_t*); +extern ROCKSDB_LIBRARY_API uint32_t rocksdb_compactionjobinfo_compaction_reason( + const rocksdb_compactionjobinfo_t* info); +extern ROCKSDB_LIBRARY_API size_t rocksdb_compactionjobinfo_num_input_files( + const rocksdb_compactionjobinfo_t* info); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_compactionjobinfo_num_input_files_at_output_level( + const rocksdb_compactionjobinfo_t* info); + +/* Subcompaction job info */ +extern ROCKSDB_LIBRARY_API void rocksdb_subcompactionjobinfo_status( + const rocksdb_subcompactionjobinfo_t*, char**); +extern ROCKSDB_LIBRARY_API const char* rocksdb_subcompactionjobinfo_cf_name( + const rocksdb_subcompactionjobinfo_t*, size_t*); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_subcompactionjobinfo_thread_id(const rocksdb_subcompactionjobinfo_t*); +extern ROCKSDB_LIBRARY_API int rocksdb_subcompactionjobinfo_base_input_level( + const rocksdb_subcompactionjobinfo_t*); +extern ROCKSDB_LIBRARY_API int rocksdb_subcompactionjobinfo_output_level( + const rocksdb_subcompactionjobinfo_t*); +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_subcompactionjobinfo_compaction_reason( + const rocksdb_subcompactionjobinfo_t* info); + +/* External file ingestion info */ +extern ROCKSDB_LIBRARY_API const char* +rocksdb_externalfileingestioninfo_cf_name( + const rocksdb_externalfileingestioninfo_t*, size_t*); +extern ROCKSDB_LIBRARY_API const char* +rocksdb_externalfileingestioninfo_internal_file_path( + const rocksdb_externalfileingestioninfo_t*, size_t*); + +/* External write stall info */ +extern ROCKSDB_LIBRARY_API const char* rocksdb_writestallinfo_cf_name( + const rocksdb_writestallinfo_t*, size_t*); +extern ROCKSDB_LIBRARY_API const rocksdb_writestallcondition_t* +rocksdb_writestallinfo_cur(const rocksdb_writestallinfo_t*); +extern ROCKSDB_LIBRARY_API const rocksdb_writestallcondition_t* +rocksdb_writestallinfo_prev(const rocksdb_writestallinfo_t*); +extern ROCKSDB_LIBRARY_API const char* rocksdb_memtableinfo_cf_name( + const rocksdb_memtableinfo_t*, size_t*); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_memtableinfo_first_seqno(const rocksdb_memtableinfo_t*); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_memtableinfo_earliest_seqno(const rocksdb_memtableinfo_t*); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_memtableinfo_num_entries(const rocksdb_memtableinfo_t*); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_memtableinfo_num_deletes(const rocksdb_memtableinfo_t*); + +/* Event listener */ + +typedef void (*on_flush_begin_cb)(void*, rocksdb_t*, + const rocksdb_flushjobinfo_t*); +typedef void (*on_flush_completed_cb)(void*, rocksdb_t*, + const rocksdb_flushjobinfo_t*); +typedef void (*on_compaction_begin_cb)(void*, rocksdb_t*, + const rocksdb_compactionjobinfo_t*); +typedef void (*on_compaction_completed_cb)(void*, rocksdb_t*, + const rocksdb_compactionjobinfo_t*); +typedef void (*on_subcompaction_begin_cb)( + void*, const rocksdb_subcompactionjobinfo_t*); +typedef void (*on_subcompaction_completed_cb)( + void*, const rocksdb_subcompactionjobinfo_t*); +typedef void (*on_external_file_ingested_cb)( + void*, rocksdb_t*, const rocksdb_externalfileingestioninfo_t*); +typedef void (*on_background_error_cb)(void*, uint32_t, rocksdb_status_ptr_t*); +typedef void (*on_stall_conditions_changed_cb)(void*, + const rocksdb_writestallinfo_t*); +typedef void (*rocksdb_logger_logv_cb)(void*, uint32_t log_level, const char*); +typedef void (*on_memtable_sealed_cb)(void*, const rocksdb_memtableinfo_t*); +extern ROCKSDB_LIBRARY_API rocksdb_eventlistener_t* +rocksdb_eventlistener_create( + void* state_, void (*destructor_)(void*), on_flush_begin_cb on_flush_begin, + on_flush_completed_cb on_flush_completed, + on_compaction_begin_cb on_compaction_begin, + on_compaction_completed_cb on_compaction_completed, + on_subcompaction_begin_cb on_subcompaction_begin, + on_subcompaction_completed_cb on_subcompaction_completed, + on_external_file_ingested_cb on_external_file_ingested, + on_background_error_cb on_background_error, + on_stall_conditions_changed_cb on_stall_conditions_changed, + on_memtable_sealed_cb on_memtable_sealed); +extern ROCKSDB_LIBRARY_API void rocksdb_eventlistener_destroy( + rocksdb_eventlistener_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_add_eventlistener( + rocksdb_options_t*, rocksdb_eventlistener_t*); /* Cuckoo table options */ @@ -1229,6 +1499,31 @@ rocksdb_logger_create_callback_logger(int log_level, void* priv); extern ROCKSDB_LIBRARY_API void rocksdb_logger_destroy( rocksdb_logger_t* logger); + +/* File Checksum Gen Factory */ +extern ROCKSDB_LIBRARY_API rocksdb_file_checksum_gen_factory_t* +rocksdb_file_checksum_gen_crc32c_factory_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_file_checksum_gen_factory_destroy( + rocksdb_file_checksum_gen_factory_t* factory); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_file_checksum_gen_factory( + rocksdb_options_t*, rocksdb_file_checksum_gen_factory_t*); + +/* SST Partitioner Factory */ +extern ROCKSDB_LIBRARY_API rocksdb_sst_partitioner_factory_t* +rocksdb_sst_partitioner_fixed_prefix_factory_create(size_t prefix_len); +extern ROCKSDB_LIBRARY_API void rocksdb_sst_partitioner_factory_destroy( + rocksdb_sst_partitioner_factory_t* factory); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_sst_partitioner_factory( + rocksdb_options_t*, rocksdb_sst_partitioner_factory_t*); + +/* Table Properties Collector Factory */ +extern ROCKSDB_LIBRARY_API void +rocksdb_table_properties_collector_factory_destroy( + rocksdb_table_properties_collector_factory_t* factory); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_add_table_properties_collector_factory( + rocksdb_options_t*, rocksdb_table_properties_collector_factory_t*); + extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_size( rocksdb_options_t*, size_t); extern ROCKSDB_LIBRARY_API size_t @@ -1341,6 +1636,17 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_periodic_compaction_seconds( rocksdb_options_t*, uint64_t); extern ROCKSDB_LIBRARY_API uint64_t rocksdb_options_get_periodic_compaction_seconds(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_memtable_op_scan_flush_trigger(rocksdb_options_t*, + uint32_t); +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_options_get_memtable_op_scan_flush_trigger(rocksdb_options_t*); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_memtable_avg_op_scan_flush_trigger(rocksdb_options_t*, + uint32_t); +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_options_get_memtable_avg_op_scan_flush_trigger(rocksdb_options_t*); enum { rocksdb_statistics_level_disable_all = 0, @@ -1362,13 +1668,6 @@ rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, unsigned char val); extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_skip_stats_update_on_db_open(rocksdb_options_t* opt); -extern ROCKSDB_LIBRARY_API void -rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open( - rocksdb_options_t* opt, unsigned char val); -extern ROCKSDB_LIBRARY_API unsigned char -rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( - rocksdb_options_t* opt); - /* Blob Options Settings */ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_files( rocksdb_options_t* opt, unsigned char val); @@ -1448,11 +1747,6 @@ rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API int rocksdb_options_get_min_write_buffer_number_to_merge(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void -rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*, - int); -extern ROCKSDB_LIBRARY_API int -rocksdb_options_get_max_write_buffer_number_to_maintain(rocksdb_options_t*); -extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_size_to_maintain(rocksdb_options_t*, int64_t); extern ROCKSDB_LIBRARY_API int64_t @@ -1752,6 +2046,10 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio( rocksdb_options_t*, size_t window_size, size_t num_dels_trigger, double deletion_ratio); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_add_compact_on_deletion_collector_factory_min_file_size( + rocksdb_options_t*, size_t window_size, size_t num_dels_trigger, + double deletion_ratio, uint64_t min_file_size); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manual_wal_flush( rocksdb_options_t* opt, unsigned char); extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_manual_wal_flush( @@ -1875,7 +2173,8 @@ enum { rocksdb_blob_decompress_time, rocksdb_internal_range_del_reseek_count, rocksdb_block_read_cpu_time, - rocksdb_total_metric_count = 79 + rocksdb_internal_merge_point_lookup_count, + rocksdb_total_metric_count = 80 }; extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int); @@ -2012,9 +2311,6 @@ extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_tailing( rocksdb_readoptions_t*, unsigned char); extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_tailing( rocksdb_readoptions_t*); -// The functionality that this option controlled has been removed. -extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_managed( - rocksdb_readoptions_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_readahead_size( rocksdb_readoptions_t*, size_t); extern ROCKSDB_LIBRARY_API size_t @@ -2225,6 +2521,51 @@ extern ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_buffer_size( extern ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_allow_stall( rocksdb_write_buffer_manager_t* wbm, bool new_allow_stall); +/* SstFileManager */ + +extern ROCKSDB_LIBRARY_API rocksdb_sst_file_manager_t* +rocksdb_sst_file_manager_create(rocksdb_env_t* env); + +extern ROCKSDB_LIBRARY_API void rocksdb_sst_file_manager_destroy( + rocksdb_sst_file_manager_t* sfm); + +extern ROCKSDB_LIBRARY_API void +rocksdb_sst_file_manager_set_max_allowed_space_usage( + rocksdb_sst_file_manager_t* sfm, uint64_t max_allowed_space); + +extern ROCKSDB_LIBRARY_API void +rocksdb_sst_file_manager_set_compaction_buffer_size( + rocksdb_sst_file_manager_t* sfm, uint64_t compaction_buffer_size); + +extern ROCKSDB_LIBRARY_API bool +rocksdb_sst_file_manager_is_max_allowed_space_reached( + rocksdb_sst_file_manager_t* sfm); + +extern ROCKSDB_LIBRARY_API bool +rocksdb_sst_file_manager_is_max_allowed_space_reached_including_compactions( + rocksdb_sst_file_manager_t* sfm); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_sst_file_manager_get_total_size(rocksdb_sst_file_manager_t* sfm); + +extern ROCKSDB_LIBRARY_API int64_t +rocksdb_sst_file_manager_get_delete_rate_bytes_per_second( + rocksdb_sst_file_manager_t* sfm); + +extern ROCKSDB_LIBRARY_API void +rocksdb_sst_file_manager_set_delete_rate_bytes_per_second( + rocksdb_sst_file_manager_t* sfm, int64_t delete_rate); + +extern ROCKSDB_LIBRARY_API double +rocksdb_sst_file_manager_get_max_trash_db_ratio( + rocksdb_sst_file_manager_t* sfm); + +extern ROCKSDB_LIBRARY_API void rocksdb_sst_file_manager_set_max_trash_db_ratio( + rocksdb_sst_file_manager_t* sfm, double ratio); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_sst_file_manager_get_total_trash_size(rocksdb_sst_file_manager_t* sfm); + /* HyperClockCache */ extern ROCKSDB_LIBRARY_API rocksdb_hyper_clock_cache_options_t* @@ -2381,10 +2722,9 @@ rocksdb_slicetransform_create( char* (*transform)(void*, const char* key, size_t length, size_t* dst_length), unsigned char (*in_domain)(void*, const char* key, size_t length), - unsigned char (*in_range)(void*, const char* key, size_t length), const char* (*name)(void*)); extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* - rocksdb_slicetransform_create_fixed_prefix(size_t); +rocksdb_slicetransform_create_fixed_prefix(size_t); extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* rocksdb_slicetransform_create_noop(void); extern ROCKSDB_LIBRARY_API void rocksdb_slicetransform_destroy( @@ -2453,15 +2793,32 @@ rocksdb_fifo_compaction_options_set_max_table_files_size( extern ROCKSDB_LIBRARY_API uint64_t rocksdb_fifo_compaction_options_get_max_table_files_size( rocksdb_fifo_compaction_options_t* fifo_opts); +extern ROCKSDB_LIBRARY_API void +rocksdb_fifo_compaction_options_set_max_data_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_fifo_compaction_options_get_max_data_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts); +extern ROCKSDB_LIBRARY_API void +rocksdb_fifo_compaction_options_set_use_kv_ratio_compaction( + rocksdb_fifo_compaction_options_t* fifo_opts, + unsigned char use_kv_ratio_compaction); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_fifo_compaction_options_get_use_kv_ratio_compaction( + rocksdb_fifo_compaction_options_t* fifo_opts); extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy( rocksdb_fifo_compaction_options_t* fifo_opts); +extern ROCKSDB_LIBRARY_API rocksdb_livefiles_t* rocksdb_livefiles_create(void); + extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_count( const rocksdb_livefiles_t*); extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_column_family_name( const rocksdb_livefiles_t*, int index); extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_name( const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_directory( + const rocksdb_livefiles_t*, int index); extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_level( const rocksdb_livefiles_t*, int index); extern ROCKSDB_LIBRARY_API size_t @@ -2471,12 +2828,44 @@ extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_smallestkey( extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_largestkey( const rocksdb_livefiles_t*, int index, size_t* size); extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_livefiles_smallest_seqno(const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_livefiles_largest_seqno(const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_livefiles_entries(const rocksdb_livefiles_t*, int index); extern ROCKSDB_LIBRARY_API uint64_t rocksdb_livefiles_deletions(const rocksdb_livefiles_t*, int index); extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_destroy( const rocksdb_livefiles_t*); +extern ROCKSDB_LIBRARY_API rocksdb_livefile_t* rocksdb_livefile_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_column_family_name( + rocksdb_livefile_t*, const char*); +extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_level(rocksdb_livefile_t*, + int); +extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_name(rocksdb_livefile_t*, + const char*); +extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_directory( + rocksdb_livefile_t*, const char*); +extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_size(rocksdb_livefile_t*, + size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_smallest_key( + rocksdb_livefile_t*, const char*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_largest_key( + rocksdb_livefile_t*, const char*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_smallest_seqno( + rocksdb_livefile_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_largest_seqno( + rocksdb_livefile_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_num_entries( + rocksdb_livefile_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_num_deletions( + rocksdb_livefile_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_livefile_destroy(rocksdb_livefile_t*); + +extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_add(rocksdb_livefiles_t*, + rocksdb_livefile_t*); + /* Utility Helpers */ extern ROCKSDB_LIBRARY_API void rocksdb_get_options_from_string( @@ -2497,6 +2886,37 @@ extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range_cf( extern ROCKSDB_LIBRARY_API rocksdb_column_family_metadata_t* rocksdb_get_column_family_metadata(rocksdb_t* db); +extern ROCKSDB_LIBRARY_API rocksdb_import_column_family_options_t* +rocksdb_import_column_family_options_create(void); + +extern ROCKSDB_LIBRARY_API void +rocksdb_import_column_family_options_set_move_files( + rocksdb_import_column_family_options_t*, unsigned char); + +extern ROCKSDB_LIBRARY_API void rocksdb_import_column_family_options_destroy( + rocksdb_import_column_family_options_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_export_import_files_metadata_t* +rocksdb_export_import_files_metadata_create(void); + +extern ROCKSDB_LIBRARY_API char* +rocksdb_export_import_files_metadata_get_db_comparator_name( + rocksdb_export_import_files_metadata_t*); + +extern ROCKSDB_LIBRARY_API void +rocksdb_export_import_files_metadata_set_db_comparator_name( + rocksdb_export_import_files_metadata_t*, const char*); + +extern ROCKSDB_LIBRARY_API rocksdb_livefiles_t* +rocksdb_export_import_files_metadata_get_files( + rocksdb_export_import_files_metadata_t*); + +extern ROCKSDB_LIBRARY_API void rocksdb_export_import_files_metadata_set_files( + rocksdb_export_import_files_metadata_t*, rocksdb_livefiles_t*); + +extern ROCKSDB_LIBRARY_API void rocksdb_export_import_files_metadata_destroy( + rocksdb_export_import_files_metadata_t*); + /** * Returns the rocksdb_column_family_metadata_t of the specified * column family. @@ -3130,6 +3550,266 @@ extern ROCKSDB_LIBRARY_API uint64_t rocksdb_wait_for_compact_options_get_timeout( rocksdb_wait_for_compact_options_t* opt); +/* High-performance zero-copy Get variants + These functions avoid unnecessary memory allocations and copies. + The returned buffer is valid until the handle is destroyed. + Bindings should migrate to these for better performance. */ + +/* Zero-copy get that returns a handle to pinned data. + The data remains valid until rocksdb_pinnable_handle_destroy is called. + Returns NULL on error or not found. Check errptr to distinguish. */ +typedef struct rocksdb_pinnable_handle_t rocksdb_pinnable_handle_t; + +extern ROCKSDB_LIBRARY_API rocksdb_pinnable_handle_t* rocksdb_get_pinned_v2( + rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, + size_t keylen, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_pinnable_handle_t* rocksdb_get_pinned_cf_v2( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); + +/* Get the data pointer and size from a pinnable handle. + The data pointer is valid until the handle is destroyed. */ +extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnable_handle_get_value( + const rocksdb_pinnable_handle_t* handle, size_t* vallen); + +extern ROCKSDB_LIBRARY_API void rocksdb_pinnable_handle_destroy( + rocksdb_pinnable_handle_t* handle); + +/* Direct get into caller-provided buffer. + Returns 1 if value fits in buffer, 0 if buffer too small. + Sets *vallen to actual value size. + If buffer is too small, no data is copied but *vallen is set. */ +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_get_into_buffer( + rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, + size_t keylen, char* buffer, size_t buffer_size, size_t* vallen, + unsigned char* found, char** errptr); + +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_get_into_buffer_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char* buffer, size_t buffer_size, size_t* vallen, + unsigned char* found, char** errptr); + +// Remote compaction +enum { + rocksdb_compactionservice_jobstatus_success = 0, + rocksdb_compactionservice_jobstatus_failure = 1, + rocksdb_compactionservice_jobstatus_aborted = 2, + rocksdb_compactionservice_jobstatus_use_local = 3, +}; + +extern ROCKSDB_LIBRARY_API rocksdb_compactionservice_scheduleresponse_t* +rocksdb_compactionservice_scheduleresponse_create(const char* scheduled_job_id, + int status, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_compactionservice_scheduleresponse_t* +rocksdb_compactionservice_scheduleresponse_create_with_status(int status, + char** errptr); + +extern ROCKSDB_LIBRARY_API int +rocksdb_compactionservice_scheduleresponse_getstatus( + const rocksdb_compactionservice_scheduleresponse_t* response); + +extern ROCKSDB_LIBRARY_API const char* +rocksdb_compactionservice_scheduleresponse_get_scheduled_job_id( + const rocksdb_compactionservice_scheduleresponse_t* response, size_t* len); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compactionservice_scheduleresponse_t_destroy( + rocksdb_compactionservice_scheduleresponse_t* response); + +extern ROCKSDB_LIBRARY_API const char* +rocksdb_compactionservice_jobinfo_t_get_db_name( + const rocksdb_compactionservice_jobinfo_t* info, size_t* len); + +extern ROCKSDB_LIBRARY_API const char* +rocksdb_compactionservice_jobinfo_t_get_db_id( + const rocksdb_compactionservice_jobinfo_t* info, size_t* len); + +extern ROCKSDB_LIBRARY_API const char* +rocksdb_compactionservice_jobinfo_t_get_db_session_id( + const rocksdb_compactionservice_jobinfo_t* info, size_t* len); + +extern ROCKSDB_LIBRARY_API const char* +rocksdb_compactionservice_jobinfo_t_get_cf_name( + const rocksdb_compactionservice_jobinfo_t* info, size_t* len); + +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_compactionservice_jobinfo_t_get_cf_id( + const rocksdb_compactionservice_jobinfo_t* info); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_compactionservice_jobinfo_t_get_job_id( + const rocksdb_compactionservice_jobinfo_t* info); + +extern ROCKSDB_LIBRARY_API int rocksdb_compactionservice_jobinfo_t_get_priority( + const rocksdb_compactionservice_jobinfo_t* info); + +extern ROCKSDB_LIBRARY_API int +rocksdb_compactionservice_jobinfo_t_get_compaction_reason( + const rocksdb_compactionservice_jobinfo_t* info); + +extern ROCKSDB_LIBRARY_API int +rocksdb_compactionservice_jobinfo_t_get_base_input_level( + const rocksdb_compactionservice_jobinfo_t* info); + +extern ROCKSDB_LIBRARY_API int +rocksdb_compactionservice_jobinfo_t_get_output_level( + const rocksdb_compactionservice_jobinfo_t* info); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactionservice_jobinfo_t_is_full_compaction( + const rocksdb_compactionservice_jobinfo_t* info); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactionservice_jobinfo_t_is_manual_compaction( + const rocksdb_compactionservice_jobinfo_t* info); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactionservice_jobinfo_t_is_bottommost_level( + const rocksdb_compactionservice_jobinfo_t* info); + +extern ROCKSDB_LIBRARY_API rocksdb_compactionservice_t* +rocksdb_compactionservice_create( + void* state, void (*destructor)(void*), + rocksdb_compaction_service_schedule_cb schedule, const char* name, + rocksdb_compaction_service_wait_cb wait, + rocksdb_compaction_service_cancel_awaiting_jobs_cb cancel_awaiting_jobs, + rocksdb_compaction_service_on_installation_cb on_installation); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_service( + rocksdb_options_t* options, rocksdb_compactionservice_t* service); + +// CompactionServiceOptionsOverride +extern ROCKSDB_LIBRARY_API rocksdb_compaction_service_options_override_t* +rocksdb_compaction_service_options_override_create(void); + +extern ROCKSDB_LIBRARY_API rocksdb_compaction_service_options_override_t* +rocksdb_compaction_service_options_override_create_from_options( + rocksdb_options_t* option); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_destroy( + rocksdb_compaction_service_options_override_t* override_options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_set_env( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_env_t* env); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_set_comparator( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_comparator_t* comparator); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_set_merge_operator( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_mergeoperator_t* merge_operator); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_set_compaction_filter( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_compactionfilter_t* compaction_filter); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_set_compaction_filter_factory( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_compactionfilterfactory_t* compaction_filter_factory); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_set_prefix_extractor( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_slicetransform_t* prefix_extractor); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_set_block_based_table_factory( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_block_based_table_options_t* table_options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_set_cuckoo_table_factory( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_cuckoo_table_options_t* table_options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_add_event_listener( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_eventlistener_t* event_listener); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_set_statistics( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_set_info_log( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_logger_t* logger); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_set_option( + rocksdb_compaction_service_options_override_t* override_options, + const char* key, const char* value); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_set_file_checksum_gen_factory( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_file_checksum_gen_factory_t* factory); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_set_sst_partitioner_factory( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_sst_partitioner_factory_t* factory); + +extern ROCKSDB_LIBRARY_API void +rocksdb_compaction_service_options_override_add_table_properties_collector_factory( + rocksdb_compaction_service_options_override_t* override_options, + rocksdb_table_properties_collector_factory_t* factory); + +// Atomic bool management for cancellation +// Creates an atomic bool that can be used for cancellation. +// User must call rocksdb_open_and_compact_canceled_destroy() to free it. +extern ROCKSDB_LIBRARY_API unsigned char* +rocksdb_open_and_compact_canceled_create(void); + +extern ROCKSDB_LIBRARY_API void rocksdb_open_and_compact_canceled_destroy( + unsigned char* canceled); + +extern ROCKSDB_LIBRARY_API void rocksdb_open_and_compact_canceled_set( + unsigned char* canceled, unsigned char value); + +// OpenAndCompactOptions +extern ROCKSDB_LIBRARY_API rocksdb_open_and_compact_options_t* +rocksdb_open_and_compact_options_create(void); + +extern ROCKSDB_LIBRARY_API void rocksdb_open_and_compact_options_destroy( + rocksdb_open_and_compact_options_t* options); + +extern ROCKSDB_LIBRARY_API void rocksdb_open_and_compact_options_set_canceled( + rocksdb_open_and_compact_options_t* options, unsigned char* canceled); + +extern ROCKSDB_LIBRARY_API void +rocksdb_open_and_compact_options_set_allow_resumption( + rocksdb_open_and_compact_options_t* options, + unsigned char allow_resumption); + +// OpenAndCompact - main functions +extern ROCKSDB_LIBRARY_API char* rocksdb_open_and_compact( + const char* db_path, const char* output_directory, const char* input, + size_t input_len, size_t* output_len, + const rocksdb_compaction_service_options_override_t* override_options, + char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_open_and_compact_with_options( + const rocksdb_open_and_compact_options_t* options, const char* db_path, + const char* output_directory, const char* input, size_t input_len, + size_t* output_len, + const rocksdb_compaction_service_options_override_t* override_options, + char** errptr); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 54e9e88aacba..f52d5246bbfe 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -210,7 +210,15 @@ struct ShardedCacheOptions { // shard has its own LRU list for evictions. Each shard also has a mutex for // exclusive access during operations; even read operations need exclusive // access in order to update the LRU list. Mutex contention is usually low -// with enough shards. +// with enough shards. However, +// * For a single hot block, there will be mutex contention even for reads +// regardless of the number of shards. +// * LRUCaches in the size of MBs instead of GBs can have shards small enough +// that there is a random probability of some modest number of large blocks +// (especially non-partitioned filters) thrashing a single cache shard. +// +// HYPERCLOCKCACHE IS NOW GENERALLY RECOMMENDED OVER LRUCACHE. See +// HyperClockCacheOptions below. struct LRUCacheOptions : public ShardedCacheOptions { // Ratio of cache reserved for high-priority and low-priority entries, // respectively. (See Cache::Priority below more information on the levels.) @@ -298,13 +306,6 @@ struct CompressedSecondaryCacheOptions : LRUCacheOptions { // Options specific to the compression algorithm CompressionOptions compression_opts; - // compress_format_version can have two values: - // compress_format_version == 1 -- decompressed size is not included in the - // block header. - // compress_format_version == 2 -- decompressed size is included in the block - // header in varint32 format. - uint32_t compress_format_version = 2; - // Enable the custom split and merge feature, which split the compressed value // into chunks so that they may better fit jemalloc bins. bool enable_custom_split_merge = false; @@ -322,7 +323,6 @@ struct CompressedSecondaryCacheOptions : LRUCacheOptions { CacheMetadataChargePolicy _metadata_charge_policy = kDefaultCacheMetadataChargePolicy, CompressionType _compression_type = CompressionType::kLZ4Compression, - uint32_t _compress_format_version = 2, bool _enable_custom_split_merge = false, const CacheEntryRoleSet& _do_not_compress_roles = {CacheEntryRole::kFilterBlock}) @@ -331,7 +331,6 @@ struct CompressedSecondaryCacheOptions : LRUCacheOptions { _use_adaptive_mutex, _metadata_charge_policy, _low_pri_pool_ratio), compression_type(_compression_type), - compress_format_version(_compress_format_version), enable_custom_split_merge(_enable_custom_split_merge), do_not_compress_roles(_do_not_compress_roles) {} @@ -352,7 +351,6 @@ inline std::shared_ptr NewCompressedSecondaryCache( CacheMetadataChargePolicy metadata_charge_policy = kDefaultCacheMetadataChargePolicy, CompressionType compression_type = CompressionType::kLZ4Compression, - uint32_t compress_format_version = 2, bool enable_custom_split_merge = false, const CacheEntryRoleSet& _do_not_compress_roles = { CacheEntryRole::kFilterBlock}) { @@ -360,8 +358,7 @@ inline std::shared_ptr NewCompressedSecondaryCache( capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator, use_adaptive_mutex, metadata_charge_policy, compression_type, - compress_format_version, enable_custom_split_merge, - _do_not_compress_roles) + enable_custom_split_merge, _do_not_compress_roles) .MakeSharedSecondaryCache(); } @@ -371,64 +368,50 @@ inline std::shared_ptr NewCompressedSecondaryCache( return opts.MakeSharedSecondaryCache(); } -// HyperClockCache - A lock-free Cache alternative for RocksDB block cache -// that offers much improved CPU efficiency vs. LRUCache under high parallel -// load or high contention, with some caveats: +// HyperClockCache (also known as HCC) - A lock-free Cache alternative for +// RocksDB block cache that offers much improved CPU efficiency vs. LRUCache +// under high parallel load or high contention. Additionally, HCC only uses +// sharding for a modest performance boost, so can use much larger cache shards +// than LRUCache, dramatically reducing the risk of thrashing in configurations +// or work loads with some large blocks. +// +// HYPERCLOCKCACHE IS NOW GENERALLY RECOMMENDED OVER LRUCACHE +// +// Some caveats: // * Not a general Cache implementation: can only be used for // BlockBasedTableOptions::block_cache, which RocksDB uses in a way that is // compatible with HyperClockCache. -// * Requires an extra tuning parameter: see estimated_entry_charge below. -// Similarly, substantially changing the capacity with SetCapacity could -// harm efficiency. -> EXPERIMENTAL: the tuning parameter can be set to 0 -// to find the appropriate balance automatically. // * Cache priorities are less aggressively enforced, which could cause // cache dilution from long range scans (unless they use fill_cache=false). +// * In some configurations, depends on anonymous mmap support, available in +// Linux, Windows and more. +// * May have slightly lower (or slightly higher) cache hit rate vs. LRUCache, +// because of the bounded counting-CLOCK eviction algorithm. // // See internal cache/clock_cache.h for full description. struct HyperClockCacheOptions : public ShardedCacheOptions { - // The estimated average `charge` associated with cache entries. - // - // EXPERIMENTAL: the field can be set to 0 to size the table dynamically - // and automatically. See also min_avg_entry_charge. This feature requires - // platform support for lazy anonymous memory mappings (incl Linux, Windows). - // Performance is very similar to choosing the best configuration parameter. - // - // PRODUCTION-TESTED: This is a critical configuration parameter for good - // performance, because having a table size that is fixed at creation time - // greatly reduces the required synchronization between threads. - // * If the estimate is substantially too low (e.g. less than half the true - // average) then metadata space overhead with be substantially higher (e.g. - // 200 bytes per entry rather than 100). With kFullChargeCacheMetadata, this - // can slightly reduce cache hit rates, and slightly reduce access times due - // to the larger working memory size. - // * If the estimate is substantially too high (e.g. 25% higher than the true - // average) then there might not be sufficient slots in the hash table for - // both efficient operation and capacity utilization (hit rate). The hyper - // cache will evict entries to prevent load factors that could dramatically - // affect lookup times, instead letting the hit rate suffer by not utilizing - // the full capacity. + // OPTIONAL: The estimated average `charge` associated with cache entries. // - // A reasonable choice is the larger of block_size and metadata_block_size. - // When WriteBufferManager (and similar) charge memory usage to the block - // cache, this can lead to the same effect as estimate being too low, which - // is better than the opposite. Therefore, the general recommendation is to - // assume that other memory charged to block cache could be negligible, and - // ignore it in making the estimate. + // When not provided (== 0, recommended and default), an HCC variant with a + // dynamically-growing table and generally good performance is used. This + // variant depends on anonymous mmaps so might not be available on all + // platforms. // - // The best parameter choice based on a cache in use is given by - // GetUsage() / GetOccupancyCount(), ignoring metadata overheads such as - // with kDontChargeCacheMetadata. More precisely with - // kFullChargeCacheMetadata is (GetUsage() - 64 * GetTableAddressCount()) / - // GetOccupancyCount(). However, when the average value size might vary - // (e.g. balance between metadata and data blocks in cache), it is better - // to estimate toward the lower side than the higher side. + // If the average "charge" (uncompressed block size) of block cache entries + // is reasonably predicted and provided here, the most efficient variant of + // HCC is used. Performance is degraded if the prediction is inaccurate. + // Prediction could be difficult or impossible with cache-charging features + // such as WriteBufferManager. The best parameter choice based on a cache + // in use is roughly given by GetUsage() / GetOccupancyCount(), though it is + // better to estimate toward the lower side than the higher side when the + // ratio might vary. size_t estimated_entry_charge; - // EXPERIMENTAL: When estimated_entry_charge == 0, this parameter establishes - // a promised lower bound on the average charge of all entries in the table, - // which is roughly the average uncompressed SST block size of block cache - // entries, typically > 4KB. The default should generally suffice with almost - // no cost. (This option is ignored for estimated_entry_charge > 0.) + // When estimated_entry_charge == 0, this parameter establishes a promised + // lower bound on the average charge of all entries in the table, which is + // roughly the average uncompressed SST block size of block cache entries, + // typically > 4KB. The default should generally suffice with almost no cost. + // (This option is ignored for estimated_entry_charge > 0.) // // More detail: The table for indexing cache entries will grow automatically // as needed, but a hard upper bound on that size is needed at creation time. @@ -478,8 +461,8 @@ struct HyperClockCacheOptions : public ShardedCacheOptions { // keep operations very fast. int eviction_effort_cap = 30; - HyperClockCacheOptions( - size_t _capacity, size_t _estimated_entry_charge, + explicit HyperClockCacheOptions( + size_t _capacity, size_t _estimated_entry_charge = 0, int _num_shard_bits = -1, bool _strict_capacity_limit = false, std::shared_ptr _memory_allocator = nullptr, CacheMetadataChargePolicy _metadata_charge_policy = diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h index 66f2f390e7d1..68a7116de9bd 100644 --- a/include/rocksdb/compaction_filter.h +++ b/include/rocksdb/compaction_filter.h @@ -284,9 +284,7 @@ class CompactionFilter : public Customizable { std::string* new_value, std::vector>* /* new_columns */, std::string* skip_until) const { -#ifdef NDEBUG (void)existing_columns; -#endif assert(!existing_value || !existing_columns); assert(value_type == ValueType::kWideColumnEntity || existing_value); diff --git a/include/rocksdb/compaction_job_stats.h b/include/rocksdb/compaction_job_stats.h index 91709795a176..c9476d70a78d 100644 --- a/include/rocksdb/compaction_job_stats.h +++ b/include/rocksdb/compaction_job_stats.h @@ -24,15 +24,18 @@ struct CompactionJobStats { // the elapsed CPU time of this compaction in microseconds. uint64_t cpu_micros = 0; - // Used internally indicating whether a subcompaction's - // `num_input_records` is accurate. - bool has_num_input_records = false; + // True if `num_input_records` is accurate across all subcompactions. + // See CompactionIterator::must_count_input_entries for some implementation + // details why `num_input_records` may not be accurate. + bool has_accurate_num_input_records = true; // the number of compaction input records. uint64_t num_input_records = 0; // the number of blobs read from blob files uint64_t num_blobs_read = 0; // the number of compaction input files (table files) size_t num_input_files = 0; + // The number of input files that get trivially moved. + size_t num_input_files_trivially_moved = 0; // the number of compaction input files at the output level (table files) size_t num_input_files_at_output_level = 0; // the number of compaction input files that are filtered out by compaction @@ -118,6 +121,6 @@ struct CompactionJobStats { // number of single-deletes which meet something other than a put uint64_t num_single_del_mismatch = 0; - // TODO: Add output_to_penultimate_level output information + // TODO: Add output_to_proximal_level output information }; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/compression_type.h b/include/rocksdb/compression_type.h index 2ca742aa3853..2261a44439b9 100644 --- a/include/rocksdb/compression_type.h +++ b/include/rocksdb/compression_type.h @@ -18,14 +18,148 @@ namespace ROCKSDB_NAMESPACE { enum CompressionType : unsigned char { // NOTE: do not change the values of existing entries, as these are // part of the persistent format on disk. - kNoCompression = 0x0, - kSnappyCompression = 0x1, - kZlibCompression = 0x2, - kBZip2Compression = 0x3, - kLZ4Compression = 0x4, - kLZ4HCCompression = 0x5, - kXpressCompression = 0x6, - kZSTD = 0x7, + kNoCompression = 0x00, + kSnappyCompression = 0x01, + kZlibCompression = 0x02, + kBZip2Compression = 0x03, + kLZ4Compression = 0x04, + kLZ4HCCompression = 0x05, + kXpressCompression = 0x06, + kZSTD = 0x07, + kLastBuiltinCompression = kZSTD, + + // Reserved for future use: up to 0x7F + + // For use by user custom CompressionManagers + kCustomCompression80 = 0x80, + kFirstCustomCompression = kCustomCompression80, + kCustomCompression81 = 0x81, + kCustomCompression82 = 0x82, + kCustomCompression83 = 0x83, + kCustomCompression84 = 0x84, + kCustomCompression85 = 0x85, + kCustomCompression86 = 0x86, + kCustomCompression87 = 0x87, + kCustomCompression88 = 0x88, + kCustomCompression89 = 0x89, + kCustomCompression8A = 0x8A, + kCustomCompression8B = 0x8B, + kCustomCompression8C = 0x8C, + kCustomCompression8D = 0x8D, + kCustomCompression8E = 0x8E, + kCustomCompression8F = 0x8F, + kCustomCompression90 = 0x90, + kCustomCompression91 = 0x91, + kCustomCompression92 = 0x92, + kCustomCompression93 = 0x93, + kCustomCompression94 = 0x94, + kCustomCompression95 = 0x95, + kCustomCompression96 = 0x96, + kCustomCompression97 = 0x97, + kCustomCompression98 = 0x98, + kCustomCompression99 = 0x99, + kCustomCompression9A = 0x9A, + kCustomCompression9B = 0x9B, + kCustomCompression9C = 0x9C, + kCustomCompression9D = 0x9D, + kCustomCompression9E = 0x9E, + kCustomCompression9F = 0x9F, + kCustomCompressionA0 = 0xA0, + kCustomCompressionA1 = 0xA1, + kCustomCompressionA2 = 0xA2, + kCustomCompressionA3 = 0xA3, + kCustomCompressionA4 = 0xA4, + kCustomCompressionA5 = 0xA5, + kCustomCompressionA6 = 0xA6, + kCustomCompressionA7 = 0xA7, + kCustomCompressionA8 = 0xA8, + kCustomCompressionA9 = 0xA9, + kCustomCompressionAA = 0xAA, + kCustomCompressionAB = 0xAB, + kCustomCompressionAC = 0xAC, + kCustomCompressionAD = 0xAD, + kCustomCompressionAE = 0xAE, + kCustomCompressionAF = 0xAF, + kCustomCompressionB0 = 0xB0, + kCustomCompressionB1 = 0xB1, + kCustomCompressionB2 = 0xB2, + kCustomCompressionB3 = 0xB3, + kCustomCompressionB4 = 0xB4, + kCustomCompressionB5 = 0xB5, + kCustomCompressionB6 = 0xB6, + kCustomCompressionB7 = 0xB7, + kCustomCompressionB8 = 0xB8, + kCustomCompressionB9 = 0xB9, + kCustomCompressionBA = 0xBA, + kCustomCompressionBB = 0xBB, + kCustomCompressionBC = 0xBC, + kCustomCompressionBD = 0xBD, + kCustomCompressionBE = 0xBE, + kCustomCompressionBF = 0xBF, + kCustomCompressionC0 = 0xC0, + kCustomCompressionC1 = 0xC1, + kCustomCompressionC2 = 0xC2, + kCustomCompressionC3 = 0xC3, + kCustomCompressionC4 = 0xC4, + kCustomCompressionC5 = 0xC5, + kCustomCompressionC6 = 0xC6, + kCustomCompressionC7 = 0xC7, + kCustomCompressionC8 = 0xC8, + kCustomCompressionC9 = 0xC9, + kCustomCompressionCA = 0xCA, + kCustomCompressionCB = 0xCB, + kCustomCompressionCC = 0xCC, + kCustomCompressionCD = 0xCD, + kCustomCompressionCE = 0xCE, + kCustomCompressionCF = 0xCF, + kCustomCompressionD0 = 0xD0, + kCustomCompressionD1 = 0xD1, + kCustomCompressionD2 = 0xD2, + kCustomCompressionD3 = 0xD3, + kCustomCompressionD4 = 0xD4, + kCustomCompressionD5 = 0xD5, + kCustomCompressionD6 = 0xD6, + kCustomCompressionD7 = 0xD7, + kCustomCompressionD8 = 0xD8, + kCustomCompressionD9 = 0xD9, + kCustomCompressionDA = 0xDA, + kCustomCompressionDB = 0xDB, + kCustomCompressionDC = 0xDC, + kCustomCompressionDD = 0xDD, + kCustomCompressionDE = 0xDE, + kCustomCompressionDF = 0xDF, + kCustomCompressionE0 = 0xE0, + kCustomCompressionE1 = 0xE1, + kCustomCompressionE2 = 0xE2, + kCustomCompressionE3 = 0xE3, + kCustomCompressionE4 = 0xE4, + kCustomCompressionE5 = 0xE5, + kCustomCompressionE6 = 0xE6, + kCustomCompressionE7 = 0xE7, + kCustomCompressionE8 = 0xE8, + kCustomCompressionE9 = 0xE9, + kCustomCompressionEA = 0xEA, + kCustomCompressionEB = 0xEB, + kCustomCompressionEC = 0xEC, + kCustomCompressionED = 0xED, + kCustomCompressionEE = 0xEE, + kCustomCompressionEF = 0xEF, + kCustomCompressionF0 = 0xF0, + kCustomCompressionF1 = 0xF1, + kCustomCompressionF2 = 0xF2, + kCustomCompressionF3 = 0xF3, + kCustomCompressionF4 = 0xF4, + kCustomCompressionF5 = 0xF5, + kCustomCompressionF6 = 0xF6, + kCustomCompressionF7 = 0xF7, + kCustomCompressionF8 = 0xF8, + kCustomCompressionF9 = 0xF9, + kCustomCompressionFA = 0xFA, + kCustomCompressionFB = 0xFB, + kCustomCompressionFC = 0xFC, + kCustomCompressionFD = 0xFD, + kCustomCompressionFE = 0xFE, + kLastCustomCompression = kCustomCompressionFE, // kDisableCompressionOption is used to disable some compression options. kDisableCompressionOption = 0xff, @@ -92,11 +226,15 @@ struct CompressionOptions { // The training data will be used to generate a dictionary of max_dict_bytes. uint32_t zstd_max_train_bytes = 0; - // Number of threads for parallel compression. - // Parallel compression is enabled only if threads > 1. - // THE FEATURE IS STILL EXPERIMENTAL + // Number of threads for parallel compression for each running flush or + // compaction job. Parallel compression is enabled only if threads > 1. Not + // recommended for lightweight compression algorithms such as Snappy, LZ4, and + // obviously kNoCompression because there is unlikely to be a throughput gain. // - // This option is valid only when BlockBasedTable is used. + // This option is valid only when BlockBasedTable is used and is disabled + // (sanitized to 1) with any of these: + // * User-defined index (UserDefinedIndexFactory) + // * partition_filters == true && decouple_partitioned_filters == false // // When parallel compression is enabled, SST size file sizes might be // more inflated compared to the target size, because more data of unknown @@ -175,9 +313,10 @@ struct CompressionOptions { max_compressed_bytes_per_kb = static_cast(1024.0 / min_ratio + 0.5); } -#if __cplusplus >= 202002L bool operator==(const CompressionOptions& rhs) const = default; -#endif }; +// See advanced_compression.h +class CompressionManager; + } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h index 27127fbebfbf..95bfe2c692b6 100644 --- a/include/rocksdb/convenience.h +++ b/include/rocksdb/convenience.h @@ -450,6 +450,22 @@ Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, // Delete files in multiple ranges at once // Delete files in a lot of ranges one at a time can be slow, use this API for // better performance in that case. +Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, + const RangeOpt* ranges, size_t n, + bool include_end = true); + +// DEPRECATED +struct RangePtr { + // In case of user_defined timestamp, if enabled, `start` and `limit` should + // point to key without timestamp part. + const Slice* start; + const Slice* limit; + + RangePtr() : start(nullptr), limit(nullptr) {} + RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {} +}; + +// DEPRECATED Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, const RangePtr* ranges, size_t n, bool include_end = true); diff --git a/include/rocksdb/data_structure.h b/include/rocksdb/data_structure.h index ffab82c514a5..7563a83abfcf 100644 --- a/include/rocksdb/data_structure.h +++ b/include/rocksdb/data_structure.h @@ -7,34 +7,48 @@ #include +#include #include #include -#include +#include +#include +#include +#include "rocksdb/comparator.h" #include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" namespace ROCKSDB_NAMESPACE { namespace detail { int CountTrailingZeroBitsForSmallEnumSet(uint64_t); +int BitsSetToOneForSmallEnumSet(uint64_t); } // namespace detail -// Represents a set of values of some enum type with a small number of -// possible enumerators. For now, it supports enums where no enumerator -// exceeds 63 when converted to int. +// Represents a set of values of some enum type with a small number of possible +// enumerators. Assumes that any combination of enumerators with values 0 +// through MAX_ENUMERATOR (inclusive) might be part of the set. NOTE: would like +// to use std::bitset, but it doesn't support constexpr (in C++17) operations +// and doesn't support efficient iteration over sparse "set to true" entries. template class SmallEnumSet { private: - using StateT = uint64_t; - static constexpr int kStateBits = sizeof(StateT) * 8; - static constexpr int kMaxMax = kStateBits - 1; static constexpr int kMaxValue = static_cast(MAX_ENUMERATOR); static_assert(kMaxValue >= 0); - static_assert(kMaxValue <= kMaxMax); + static_assert(kMaxValue < 1024, "MAX_ENUMERATOR is suspiciously large"); + using PieceT = uint64_t; + static constexpr int kPieceBits = 64; + static constexpr int kPieceMask = 63; + static constexpr int kPieceShift = 6; + static constexpr int kPieceCount = kMaxValue / kPieceBits + 1; + using StateT = std::array; + static constexpr int kStateBits = kPieceBits * kPieceCount; + static_assert(kStateBits == sizeof(StateT) * 8); + static_assert(kMaxValue <= kStateBits - 1); public: - // construct / create - SmallEnumSet() : state_(0) {} + // construct / create empty set + SmallEnumSet() : state_{} {} template /*implicit*/ constexpr SmallEnumSet(const ENUM_TYPE e, TRest... rest) { @@ -44,8 +58,16 @@ class SmallEnumSet { // Return the set that includes all valid values, assuming the enum // is "dense" (includes all values converting to 0 through kMaxValue) static constexpr SmallEnumSet All() { - StateT tmp = StateT{1} << kMaxValue; - return SmallEnumSet(RawStateMarker(), tmp | (tmp - 1)); + StateT tmp; + for (int i = 0; i < kPieceCount - 1; ++i) { + tmp[i] = ~PieceT{0}; + } + if constexpr (((kMaxValue + 1) & kPieceMask) != 0) { + tmp[kPieceCount - 1] = (PieceT{1} << ((kMaxValue + 1) & kPieceMask)) - 1; + } else { + tmp[kPieceCount - 1] = ~PieceT{0}; + } + return SmallEnumSet(RawStateMarker(), tmp); } // equality @@ -60,11 +82,17 @@ class SmallEnumSet { bool Contains(const ENUM_TYPE e) const { int value = static_cast(e); assert(value >= 0 && value <= kMaxValue); - StateT tmp = 1; - return state_ & (tmp << value); + return GetPiece(value) & (PieceT{1} << (value & kPieceMask)); } - bool empty() const { return state_ == 0; } + bool empty() const { + for (int i = 0; i < kPieceCount; ++i) { + if (state_[i] != 0) { + return false; + } + } + return true; + } // iterator class const_iterator { @@ -92,7 +120,7 @@ class SmallEnumSet { if (pos_ < kMaxValue) { pos_ = set_->SkipUnset(pos_ + 1); } else { - pos_ = kStateBits; + pos_ = kMaxValue + 1; } return *this; } @@ -118,7 +146,15 @@ class SmallEnumSet { const_iterator begin() const { return const_iterator(this, SkipUnset(0)); } - const_iterator end() const { return const_iterator(this, kStateBits); } + const_iterator end() const { return const_iterator(this, kMaxValue + 1); } + + size_t count() const { + size_t rv = 0; + for (int i = 0; i < kPieceCount; ++i) { + rv += static_cast(detail::BitsSetToOneForSmallEnumSet(state_[i])); + } + return rv; + } // mutable ops @@ -127,9 +163,10 @@ class SmallEnumSet { bool Add(const ENUM_TYPE e) { int value = static_cast(e); assert(value >= 0 && value <= kMaxValue); - StateT old_state = state_; - state_ |= (StateT{1} << value); - return old_state != state_; + PieceT& piece_ref = RefPiece(value); + PieceT old_piece = piece_ref; + piece_ref |= (PieceT{1} << (value & kPieceMask)); + return old_piece != piece_ref; } // Modifies the set (if needed) not to include the given value. Returns true @@ -137,18 +174,20 @@ class SmallEnumSet { bool Remove(const ENUM_TYPE e) { int value = static_cast(e); assert(value >= 0 && value <= kMaxValue); - StateT old_state = state_; - state_ &= ~(StateT{1} << value); - return old_state != state_; + PieceT& piece_ref = RefPiece(value); + PieceT old_piece = piece_ref; + piece_ref &= ~(PieceT{1} << (value & kPieceMask)); + return old_piece != piece_ref; } // applicative ops // Return a new set based on this one with the additional value(s) inserted constexpr SmallEnumSet With(const ENUM_TYPE e) const { - int value = static_cast(e); - assert(value >= 0 && value <= kMaxValue); - return SmallEnumSet(RawStateMarker(), state_ | (StateT{1} << value)); + assert(static_cast(e) >= 0 && static_cast(e) <= kMaxValue); + SmallEnumSet rv(*this); + rv.Add(e); + return rv; } template constexpr SmallEnumSet With(const ENUM_TYPE e1, const ENUM_TYPE e2, @@ -158,9 +197,10 @@ class SmallEnumSet { // Return a new set based on this one excluding the given value(s) constexpr SmallEnumSet Without(const ENUM_TYPE e) const { - int value = static_cast(e); - assert(value >= 0 && value <= kMaxValue); - return SmallEnumSet(RawStateMarker(), state_ & ~(StateT{1} << value)); + assert(static_cast(e) >= 0 && static_cast(e) <= kMaxValue); + SmallEnumSet rv(*this); + rv.Remove(e); + return rv; } template constexpr SmallEnumSet Without(const ENUM_TYPE e1, const ENUM_TYPE e2, @@ -170,17 +210,568 @@ class SmallEnumSet { private: int SkipUnset(int pos) const { - StateT tmp = state_ >> pos; - if (tmp == 0) { - return kStateBits; - } else { - return pos + detail::CountTrailingZeroBitsForSmallEnumSet(tmp); + while (pos <= kMaxValue) { + PieceT remainder = GetPiece(pos) >> (pos & kPieceMask); + if (remainder != 0) { + return pos + detail::CountTrailingZeroBitsForSmallEnumSet(remainder); + } + pos = (pos + kPieceBits) & ~kPieceMask; } + return kMaxValue + 1; } struct RawStateMarker {}; explicit SmallEnumSet(RawStateMarker, StateT state) : state_(state) {} + PieceT GetPiece(int pos) const { + if constexpr (kPieceCount == 1) { + return state_[0]; + } else { + return state_[pos >> kPieceShift]; + } + } + PieceT& RefPiece(int pos) { + if constexpr (kPieceCount == 1) { + return state_[0]; + } else { + return state_[pos >> kPieceShift]; + } + } StateT state_; }; +// A smart pointer that tracks an object and an owner, using a statically +// determined function on those to reclaim the object, if both object and owner +// are non-null +template +class ManagedPtr { + public: + ManagedPtr() = default; + ManagedPtr(T* ptr, Owner* owner) : ptr_(ptr), owner_(owner) {} + ~ManagedPtr() { Free(); } + // No copies + ManagedPtr(const ManagedPtr&) = delete; + ManagedPtr& operator=(const ManagedPtr&) = delete; + // Moves + ManagedPtr(ManagedPtr&& other) noexcept { + ptr_ = other.ptr_; + owner_ = other.owner_; + other.ptr_ = nullptr; + other.owner_ = nullptr; + } + ManagedPtr& operator=(ManagedPtr&& other) noexcept { + if (this == &other) { + return *this; + } + Free(); + ptr_ = other.ptr_; + owner_ = other.owner_; + other.ptr_ = nullptr; + other.owner_ = nullptr; + return *this; + } + + T* get() const { return ptr_; } + T* operator->() const { return ptr_; } + T& operator*() const { return *ptr_; } + operator bool() const { return ptr_ != nullptr; } + + Owner* owner() const { return owner_; } + + private: + T* ptr_ = nullptr; + Owner* owner_ = nullptr; + + void Free() { + if (ptr_ && owner_) { + if constexpr (std::is_member_function_pointer_v) { + (owner_->*Fn)(ptr_); + } else { + Fn(owner_, ptr_); + } + } + } +}; + +template +class Interval; + +// The Interval Class is a generic class for holding a range, for example [2, +// 4]. It can be used within the IntervalSet class, which is able to keep an +// ordered, non-intersecting set of intervals within it. Intervals can have +// open-ended end points, (i.e., to infinity) for example [2,). +template > +class Interval { + public: + enum class End { INF }; + struct CompareVariant { + comp comparator; + bool operator()(const std::variant& a, + const std::variant& b) const { + if (std::holds_alternative(a) && std::holds_alternative(b)) { + return comparator(std::get(a), std::get(b)); + } + if (std::holds_alternative(a) && std::holds_alternative(b)) { + return false; + } + if (std::holds_alternative(a) && std::holds_alternative(b)) { + return false; + } + return true; // std::holds_alternative(a) && + // std::holds_alternative(b) + } + }; + + /* implicit */ Interval(const T& start, const T& end) + : start_(start), end_(end) {} + /* implicit */ Interval(const T& start) : start_(start), end_(End::INF) {} + + // Add constructor that takes a pair + /* implicit */ Interval(const std::pair& p) + : start_(p.first), end_(p.second) {} + + T& start() { return start_; } + + const T& start() const { return start_; } + + bool has_end() const { return std::holds_alternative(end_); } + + T& end() { return std::get(end_); } + + const T& end() const { return std::get(end_); } + + // Support comparison with std::pair + bool operator==(const std::pair& p) const { + return start_ == p.first && has_end() && end() == p.second; + } + + // Support comparison with another Interval + bool operator==(const Interval& other) const { + if (start_ != other.start_) { + return false; + } + + // Both have infinite end + if (!has_end() && !other.has_end()) { + return true; + } + + // One has infinite end, the other doesn't + if (has_end() != other.has_end()) { + return false; + } + + // Both have finite end + return end() == other.end(); + } + + // Support comparison with another Interval + bool operator<(const Interval& other) const { + return comparator(start_, other.start_); + } + + bool Compare(const Interval& other) const { + return comparator(start_, other.start_); + } + + private: + T start_; + std::variant end_; + comp comparator; +}; + +// Specialized version of Interval for Slice +template <> +class Interval { + public: + enum class End { INF }; + + // Constructors that take a Comparator + /* implicit */ Interval(const Comparator* c, const Slice& start, + const Slice& end) + : start_(start), end_(end), comparator_(c) {} + + /* implicit */ Interval(const Comparator* c, const Slice& start) + : start_(start), end_(End::INF), comparator_(c) {} + + // Constructor that takes a pair + /* implicit */ Interval(const Comparator* c, const std::pair& p) + : start_(p.first), end_(p.second), comparator_(c) {} + + Slice& start() { return start_; } + + const Slice& start() const { return start_; } + + bool has_end() const { return std::holds_alternative(end_); } + + Slice& end() { return std::get(end_); } + + const Slice& end() const { return std::get(end_); } + + // Support comparison with std::pair + bool operator==(const std::pair& p) const { + return start_ == p.first && has_end() && end() == p.second; + } + + // Support comparison with another Interval + bool operator==(const Interval& other) const { + if (comparator_->Compare(start_, other.start_) != 0) { + return false; + } + + // Both have infinite end + if (!has_end() && !other.has_end()) { + return true; + } + + // One has infinite end, the other doesn't + if (has_end() != other.has_end()) { + return false; + } + + // Both have finite end + return comparator_->Compare(end(), other.end()) == 0; + } + + // Support comparison with another Interval + bool operator<(const Interval& other) const { + return comparator_->Compare(start_, other.start_) < 0; + } + + bool Compare(const Interval& other) const { + return comparator_->Compare(start_, other.start_) < 0; + } + + const Comparator* GetComparator() const { return comparator_; } + + private: + Slice start_; + std::variant end_; + const Comparator* comparator_; + + std::unordered_map property_bag; +}; + +template > +struct CompareInterval { + bool operator()(const Interval& a, + const Interval& b) const { + return a.Compare(b); + } +}; + +// IntervalSet will be used to represent a set of intervals (including unbounded +// ones). The intervals are unique and disjoint. Intervals that are inserted +// will merge with any range they intersect with. +template ::CompareVariant> +class IntervalSet { + public: + IntervalSet(Compare c = Compare()) : comp_(c) {} + + void insert(Interval&& i) { insertImpl(i); } + + void insert(const T& start, const T& end) { + insertImpl(Interval(start, end)); + } + + void insert(const T& start) { insertImpl(Interval(start)); } + + bool empty() const { return intervals_.empty(); } + void clear() { intervals_.clear(); } + + auto begin() { return intervals_.begin(); } + auto end() { return intervals_.end(); } + + auto cbegin() const { return intervals_.cbegin(); } + auto cend() const { return intervals_.cend(); } + + size_t size() const { return intervals_.size(); } + + private: + void insertImpl(const Interval& i) { + // Skip empty intervals + if (i.has_end() && !comp_(i.start(), i.end()) && + !comp_(i.end(), i.start())) { + return; + } + + // First, check if there's any infinite interval that would contain this one + for (auto it = intervals_.begin(); it != intervals_.end(); ++it) { + if (!it->has_end() && !comp_(i.start(), it->start())) { + // This interval starts at or after an infinite interval + return; + } + } + + // Find the position where the interval should be inserted + auto it = intervals_.begin(); + while (it != intervals_.end() && comp_(it->start(), i.start())) { + ++it; + } + + // Check if we need to consider the previous interval + if (it != intervals_.begin()) { + --it; + if (it->has_end() && comp_(it->end(), i.start())) { + ++it; + } + } + + T new_start = i.start(); + T new_end; + bool inf_end = false; + if (i.has_end()) { + new_end = i.end(); + } else { + // For infinite end intervals, we need to merge all intervals that start + // after new_start + std::vector to_erase; + while (it != intervals_.end()) { + new_start = comp_(it->start(), new_start) ? it->start() : new_start; + to_erase.push_back(it++); + } + + for (auto& eit : to_erase) { + intervals_.erase(eit); + } + + // Insert the new interval with infinite end + intervals_.insert(Interval(new_start)); + return; + } + + // For finite end intervals, proceed as before + std::vector to_erase; + while (it != intervals_.end() && !comp_(new_end, it->start())) { + if (it->has_end() && comp_(it->end(), new_start)) { + ++it; + continue; + } + new_start = comp_(it->start(), new_start) ? it->start() : new_start; + if (it->has_end()) { + new_end = comp_(new_end, it->end()) ? it->end() : new_end; + } else { + // If we encounter an interval with infinite end, our new interval also + // becomes infinite + inf_end = true; + break; + } + to_erase.push_back(it++); + } + + // Check for any infinite intervals that start after this one + auto check_it = it; + while (check_it != intervals_.end()) { + if (!check_it->has_end()) { + inf_end = true; + to_erase.push_back(check_it); + } + ++check_it; + } + + for (auto& eit : to_erase) { + intervals_.erase(eit); + } + + if (inf_end) { + intervals_.insert(Interval(new_start)); + } else { + intervals_.insert(Interval(new_start, new_end)); + } + } + + std::set, CompareInterval> intervals_; + Compare comp_; +}; + +// Specialization of IntervalSet for Slices. +// Slice based intervals can have properties attached to them. This is used to +// push down properties in the MultiScan API. We accept two modes with +// IntervalSet, fail_on_intersect, which imposes a restriction that inserted +// ranges will be disjoint, this is needed when using properties. Insert will +// fail if a range is found to not be disjoint. When fail_on_instersect is +// false, the ranges will be merged. +template <> +class IntervalSet { + public: + explicit IntervalSet(const Comparator* c, bool fail_on_intersect = false) + : comp_(c), prop_(fail_on_intersect) {} + + // Insert returns true if the interval was inserted. False indicates that the + // interval was not inserted, this could be do to an empty range OR that the + // IntervalSet is in with_properties mode and the interval overlaps with an + // existing interval. + bool insert(const Slice& start, const Slice& end) { + return insertImpl(Interval(comp_, start, end)); + } + + // Insert returns true if the interval was inserted. False indicates that the + // interval was not inserted, this could be do to an empty range OR that the + // IntervalSet is in with_properties mode and the interval overlaps with an + // existing interval. + bool insert(const Slice& start) { + // Create an interval with infinite end + Interval interval(comp_, start); + return insertImpl(interval); + } + + bool insert(Interval&& i) { return insertImpl(i); } + + bool empty() const { return intervals_.empty(); } + void clear() { intervals_.clear(); } + + auto begin() { return intervals_.begin(); } + auto end() { return intervals_.end(); } + + auto cbegin() const { return intervals_.cbegin(); } + auto cend() const { return intervals_.cend(); } + + size_t size() const { return intervals_.size(); } + + private: + // Custom comparator for finding intervals in the vector + struct IntervalComparator { + explicit IntervalComparator(const Comparator* comp) : comp_(comp) {} + + bool operator()(const Interval& a, + const Interval& b) const { + return comp_->Compare(a.start(), b.start()) < 0; + } + + const Comparator* comp_; + }; + + typename std::vector>::iterator findPosition( + const Interval& interval) { + // Find the position where the new interval should be inserted + for (auto it = intervals_.begin(); it != intervals_.end(); ++it) { + if (comp_->Compare(it->start(), interval.start()) >= 0) { + return it; + } + } + return intervals_.end(); + } + + bool insertImpl(const Interval& i) { + // Skip empty intervals + if (i.has_end() && comp_->Compare(i.start(), i.end()) >= 0) { + return false; + } + + // Find the position where this interval would be inserted + // This also checks if the interval is completely contained within an + // existing one + auto it = findPosition(i); + + // Check if we need to merge with previous interval + if (it != intervals_.begin()) { + auto prev = it - 1; + if (prev->has_end() && comp_->Compare(prev->end(), i.start()) < 0) { + // No overlap with previous interval + } else { + // There is overlap, adjust iterator to include previous interval + if (prop_) { + return false; + } + it = prev; + } + } + + Slice new_start = i.start(); + Slice new_end; + bool inf_end = false; + + if (i.has_end()) { + new_end = i.end(); + } else { + // For infinite end intervals, we need to merge all intervals that start + // after new_start + auto erase_start = it; + while (it != intervals_.end()) { + if (comp_->Compare(it->start(), new_start) < 0) { + if (prop_) { + return false; + } + new_start = it->start(); + } + ++it; + } + + // Erase all intervals from erase_start to end + if (erase_start != intervals_.end()) { + if (prop_) { + return false; + } + intervals_.erase(erase_start, intervals_.end()); + } + + // Insert the new interval with infinite end + Interval new_interval(comp_, new_start); + auto pos = findPosition(new_interval); + intervals_.insert(pos, new_interval); + return true; + } + + // For finite end intervals, find all overlapping intervals + auto erase_start = it; + auto erase_end = it; + + while (it != intervals_.end() && + comp_->Compare(new_end, it->start()) >= 0) { + if (it->has_end() && comp_->Compare(it->end(), new_start) < 0) { + // No overlap + ++it; + erase_end = it; + continue; + } + + if (comp_->Compare(it->start(), new_start) < 0) { + new_start = it->start(); + } + + if (it->has_end()) { + if (comp_->Compare(new_end, it->end()) < 0) { + new_end = it->end(); + } + } else { + // If we encounter an interval with infinite end, our new interval also + // becomes infinite + inf_end = true; + erase_end = intervals_.end(); + break; + } + + ++it; + erase_end = it; + } + + // Check for any infinite intervals that start after this one + while (it != intervals_.end()) { + if (!it->has_end()) { + inf_end = true; + erase_end = intervals_.end(); + break; + } + ++it; + } + + // Erase all merged intervals + if (erase_start != erase_end) { + intervals_.erase(erase_start, erase_end); + } + + // Insert the new merged interval + Interval new_interval = + inf_end ? Interval(comp_, new_start) + : Interval(comp_, new_start, new_end); + + auto pos = findPosition(new_interval); + intervals_.insert(pos, new_interval); + return true; + } + + const Comparator* comp_; + std::vector> intervals_; + bool prop_; +}; + } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 300af520ee9e..d31660de4ae4 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -22,6 +22,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/listener.h" #include "rocksdb/metadata.h" +#include "rocksdb/multi_scan.h" #include "rocksdb/options.h" #include "rocksdb/snapshot.h" #include "rocksdb/sst_file_writer.h" @@ -30,15 +31,10 @@ #include "rocksdb/types.h" #include "rocksdb/user_write_callback.h" #include "rocksdb/utilities/table_properties_collectors.h" +#include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/version.h" #include "rocksdb/wide_columns.h" -#if defined(__GNUC__) || defined(__clang__) -#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__)) -#elif _WIN32 -#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated) -#endif - namespace ROCKSDB_NAMESPACE { struct ColumnFamilyOptions; @@ -47,6 +43,7 @@ struct CompactRangeOptions; struct DBOptions; struct ExternalSstFileInfo; struct FlushOptions; +struct FlushWALOptions; struct Options; struct ReadOptions; struct TableProperties; @@ -55,6 +52,7 @@ struct WaitForCompactOptions; class Env; class EventListener; class FileSystem; +class MultiScan; class Replayer; class StatsHistoryIterator; class TraceReader; @@ -93,45 +91,8 @@ class ColumnFamilyHandle { virtual const Comparator* GetComparator() const = 0; }; -static const int kMajorVersion = __ROCKSDB_MAJOR__; -static const int kMinorVersion = __ROCKSDB_MINOR__; - -// A range of keys -struct Range { - // In case of user_defined timestamp, if enabled, `start` and `limit` should - // point to key without timestamp part. - Slice start; - Slice limit; - - Range() {} - Range(const Slice& s, const Slice& l) : start(s), limit(l) {} -}; - -struct RangePtr { - // In case of user_defined timestamp, if enabled, `start` and `limit` should - // point to key without timestamp part. - const Slice* start; - const Slice* limit; - - RangePtr() : start(nullptr), limit(nullptr) {} - RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {} -}; - -// It is valid that files_checksums and files_checksum_func_names are both -// empty (no checksum information is provided for ingestion). Otherwise, -// their sizes should be the same as external_files. The file order should -// be the same in three vectors and guaranteed by the caller. -// Note that, we assume the temperatures of this batch of files to be -// ingested are the same. -struct IngestExternalFileArg { - ColumnFamilyHandle* column_family = nullptr; - std::vector external_files; - IngestExternalFileOptions options; - std::vector files_checksums; - std::vector files_checksum_func_names; - // A hint as to the temperature for *reading* the files to be ingested. - Temperature file_temperature = Temperature::kUnknown; -}; +static const int kMajorVersion = ROCKSDB_MAJOR; +static const int kMinorVersion = ROCKSDB_MINOR; struct GetMergeOperandsOptions { using ContinueCallback = std::function; @@ -170,24 +131,13 @@ using TablePropertiesCollection = class DB { public: // Open the database with the specified "name" for reads and writes. - // Stores a pointer to a heap-allocated database in *dbptr and returns - // OK on success. - // Stores nullptr in *dbptr and returns a non-OK status on error, including + // On success, stores the database in *dbptr and returns OK. + // On error, resets *dbptr and returns a non-OK status, including // if the DB is already open (read-write) by another DB object. (This // guarantee depends on options.env->LockFile(), which might not provide // this guarantee in a custom Env implementation.) - // - // Caller must delete *dbptr when it is no longer needed. static Status Open(const Options& options, const std::string& name, std::unique_ptr* dbptr); - // DEPRECATED: raw pointer variant - static Status Open(const Options& options, const std::string& name, - DB** dbptr) { - std::unique_ptr smart_ptr; - Status s = Open(options, name, &smart_ptr); - *dbptr = smart_ptr.release(); - return s; - } // Open DB with column families. // db_options specify database specific options @@ -201,21 +151,12 @@ class DB { // If everything is OK, handles will on return be the same size // as column_families --- handles[i] will be a handle that you // will use to operate on column family column_family[i]. - // Before delete DB, you have to close All column families by calling + // Before destroying the DB, you have to close all column families by calling // DestroyColumnFamilyHandle() with all the handles. static Status Open(const DBOptions& db_options, const std::string& name, const std::vector& column_families, std::vector* handles, std::unique_ptr* dbptr); - // DEPRECATED: raw pointer variant - static Status Open(const DBOptions& db_options, const std::string& name, - const std::vector& column_families, - std::vector* handles, DB** dbptr) { - std::unique_ptr smart_ptr; - Status s = Open(db_options, name, column_families, handles, &smart_ptr); - *dbptr = smart_ptr.release(); - return s; - } // OpenForReadOnly() creates a Read-only instance that supports reads alone. // @@ -234,16 +175,6 @@ class DB { static Status OpenForReadOnly(const Options& options, const std::string& name, std::unique_ptr* dbptr, bool error_if_wal_file_exists = false); - // DEPRECATED: raw pointer variant - static Status OpenForReadOnly(const Options& options, const std::string& name, - DB** dbptr, - bool error_if_wal_file_exists = false) { - std::unique_ptr smart_ptr; - Status s = - OpenForReadOnly(options, name, &smart_ptr, error_if_wal_file_exists); - *dbptr = smart_ptr.release(); - return s; - } // Open the database for read only with column families. // @@ -257,18 +188,6 @@ class DB { const std::vector& column_families, std::vector* handles, std::unique_ptr* dbptr, bool error_if_wal_file_exists = false); - // DEPRECATED: raw pointer variant - static Status OpenForReadOnly( - const DBOptions& db_options, const std::string& name, - const std::vector& column_families, - std::vector* handles, DB** dbptr, - bool error_if_wal_file_exists = false) { - std::unique_ptr smart_ptr; - Status s = OpenForReadOnly(db_options, name, column_families, handles, - &smart_ptr, error_if_wal_file_exists); - *dbptr = smart_ptr.release(); - return s; - } // OpenAsSecondary() creates a secondary instance that supports read-only // operations and supports dynamic catch up with the primary (through a @@ -290,8 +209,6 @@ class DB { // The secondary_path argument points to a directory where the secondary // instance stores its info log. // The dbptr is an out-arg corresponding to the opened secondary instance. - // The pointer points to a heap-allocated database, and the caller should - // delete it after use. // // Return OK on success, non-OK on failures. // @@ -304,14 +221,6 @@ class DB { static Status OpenAsSecondary(const Options& options, const std::string& name, const std::string& secondary_path, std::unique_ptr* dbptr); - // DEPRECATED: raw pointer variant - static Status OpenAsSecondary(const Options& options, const std::string& name, - const std::string& secondary_path, DB** dbptr) { - std::unique_ptr smart_ptr; - Status s = OpenAsSecondary(options, name, secondary_path, &smart_ptr); - *dbptr = smart_ptr.release(); - return s; - } // Open DB as secondary instance with specified column families // @@ -340,9 +249,8 @@ class DB { // The handles is an out-arg corresponding to the opened database column // family handles. // The dbptr is an out-arg corresponding to the opened secondary instance. - // The pointer points to a heap-allocated database, and the caller should - // delete it after use. Before deleting the dbptr, the user should also - // delete the pointers stored in handles vector. + // Before destroying the DB, the user should call + // DestroyColumnFamilyHandle() on all the handles. // // Return OK on success, non-OK on failures. static Status OpenAsSecondary( @@ -350,18 +258,6 @@ class DB { const std::string& secondary_path, const std::vector& column_families, std::vector* handles, std::unique_ptr* dbptr); - // DEPRECATED: raw pointer variant - static Status OpenAsSecondary( - const DBOptions& db_options, const std::string& name, - const std::string& secondary_path, - const std::vector& column_families, - std::vector* handles, DB** dbptr) { - std::unique_ptr smart_ptr; - Status s = OpenAsSecondary(db_options, name, secondary_path, - column_families, handles, &smart_ptr); - *dbptr = smart_ptr.release(); - return s; - } // EXPERIMENTAL @@ -386,16 +282,30 @@ class DB { std::vector* handles, std::unique_ptr* dbptr); // End EXPERIMENTAL - // Open DB and run the compaction. - // It's a read-only operation, the result won't be installed to the DB, it - // will be output to the `output_directory`. The API should only be used with - // `options.CompactionService` to run compaction triggered by - // `CompactionService`. static Status OpenAndCompact( const std::string& name, const std::string& output_directory, const std::string& input, std::string* output, const CompactionServiceOptionsOverride& override_options); + // Opens a database and runs compaction without modifying the original DB. + // + // This read-only operation outputs compaction results to `output_directory` + // instead of installing them back to the source database. Designed primarily + // for use with `CompactionService` to process remote compaction jobs. + // + // Parameters: + // - `options`: Additional controls + // * When `allow_resumption = false`: The `output_directory` MUST be empty + // before calling this function. Any existing files (including resume + // state or output files from previous runs) in the directory may + // cause correctness errors as the compaction will start from scratch. + // - `name`: Source database path + // - `output_directory`: Where compaction output files are written + // - `input`: Serialized compaction input information + // - `output`: Serialized compaction result + // - `override_options`: Configuration overrides for the operation + // + // Returns: Status of the compaction operation static Status OpenAndCompact( const OpenAndCompactOptions& options, const std::string& name, const std::string& output_directory, const std::string& input, @@ -414,18 +324,6 @@ class DB { const std::vector& column_families, std::vector* handles, std::unique_ptr* dbptr, std::string trim_ts); - // DEPRECATED: raw pointer variant - static Status OpenAndTrimHistory( - const DBOptions& db_options, const std::string& dbname, - const std::vector& column_families, - std::vector* handles, DB** dbptr, - std::string trim_ts) { - std::unique_ptr smart_ptr; - Status s = OpenAndTrimHistory(db_options, dbname, column_families, handles, - &smart_ptr, trim_ts); - *dbptr = smart_ptr.release(); - return s; - } // Manually, synchronously attempt to resume DB writes after a write failure // to the underlying filesystem. See @@ -653,7 +551,7 @@ class DB { const Slice& /*key*/, const Slice& /*ts*/, const Slice& /*value*/); - // Apply the specified updates to the database. + // Apply the specified updates atomically to the database. // If `updates` contains no update, WAL will still be synced if // options.sync=true. // Returns OK on success, non-OK on failure. @@ -669,6 +567,21 @@ class DB { "WriteWithCallback not implemented for this interface."); } + // EXPERIMENTAL, subject to change + // Ingest a WriteBatchWithIndex into DB, bypassing memtable writes for better + // write performance. Useful when there is a large number of updates + // in the write batch. + // The WriteBatchWithIndex must be created with overwrite_key=true. + // Currently this requires WriteOptions::disableWAL=true. + // The following options are currently not supported: + // - unordered_write + // - enable_pipelined_write + virtual Status IngestWriteBatchWithIndex( + const WriteOptions& /*options*/, + std::shared_ptr /*wbwi*/) { + return Status::NotSupported("IngestWriteBatchWithIndex not implemented."); + } + // If the column family specified by "column_family" contains an entry for // "key", return the corresponding value in "*value". If the entry is a plain // key-value, return the value as-is; if it is a wide-column entity, return @@ -1073,7 +986,7 @@ class DB { // call one of the Seek methods on the iterator before using it). // // Caller should delete the iterator when it is no longer needed. - // The returned iterator should be deleted before this db is deleted. + // The returned iterator should be deleted before this db is destroyed. virtual Iterator* NewIterator(const ReadOptions& options, ColumnFamilyHandle* column_family) = 0; virtual Iterator* NewIterator(const ReadOptions& options) { @@ -1081,7 +994,7 @@ class DB { } // Returns iterators from a consistent database state across multiple // column families. Iterators are heap allocated and need to be deleted - // before the db is deleted + // before the db is destroyed virtual Status NewIterators( const ReadOptions& options, const std::vector& column_families, @@ -1110,6 +1023,44 @@ class DB { const ReadOptions& options, const std::vector& column_families) = 0; + // Get an iterator that scans multiple key ranges. The scan ranges should + // be in increasing order of start key. See multi_scan_iterator.h for more + // details. For optimal performance, ensure that either all entries in + // scan_opts specify the range limit, or none of them do. + // + // NOTE: NOT YET SUPPORTED in DBs using user timestamp (see + // Comparator::timestamp_size()) + // + // NOTE: iterate_upper_bound in ReadOptions will + // be ignored. Instead, the range.limit in ScanOptions is consulted to + // determine the upper bound key, if specified. + // + // Example usage - + // std::vector scans{{.start = Slice("bar")}, + // {.start = Slice("foo")}}; + // std::unique_ptr iter.reset( + // db->NewMultiScan()); + // try { + // for (auto scan : *iter) { + // for (auto it : scan) { + // // Do something with key - it.first + // // Do something with value - it.second + // } + // } + // } catch (MultiScanException& ex) { + // // Check ex.status() + // } catch (std::logic_error& ex) { + // // Check ex.what() + // } + virtual std::unique_ptr NewMultiScan( + const ReadOptions& /*options*/, ColumnFamilyHandle* column_family, + const MultiScanArgs& /*scan_opts*/) { + std::unique_ptr iter(NewErrorIterator(Status::NotSupported())); + std::unique_ptr ms_iter = std::make_unique( + column_family->GetComparator(), std::move(iter)); + return ms_iter; + } + // Return a handle to the current DB state. Iterators created with // this handle will all observe a stable snapshot of the current DB // state. The caller must call ReleaseSnapshot(result) when the @@ -1225,6 +1176,10 @@ class DB { // sorted runs being processed by currently running compactions. static const std::string kNumRunningCompactionSortedRuns; + // "rocksdb.compaction-abort-count" - returns the current value of the + // compaction abort counter. + static const std::string kCompactionAbortCount; + // "rocksdb.background-errors" - returns accumulated number of background // errors. static const std::string kBackgroundErrors; @@ -1600,15 +1555,39 @@ class DB { // s = db->SetOptions(cfh, {{"block_based_table_factory", // "{prepopulate_block_cache=kDisable;}"}}); virtual Status SetOptions( - ColumnFamilyHandle* /*column_family*/, - const std::unordered_map& /*opts_map*/) { - return Status::NotSupported("Not implemented"); + ColumnFamilyHandle* column_family, + const std::unordered_map& opts_map) { + return SetOptions(std::vector{column_family}, + opts_map); } // Shortcut for SetOptions on the default column family handle. virtual Status SetOptions( const std::unordered_map& new_options) { return SetOptions(DefaultColumnFamily(), new_options); } + // Shortcut where you want to apply the same options to multiple column + // families. Beneficial for avoiding reserialization of OPTIONS file. + virtual Status SetOptions( + const std::vector& column_families, + const std::unordered_map& opts_map) { + std::unordered_map> + column_families_opts_map; + column_families_opts_map.reserve(column_families.size()); + for (auto* cf : column_families) { + column_families_opts_map[cf] = opts_map; + } + return SetOptions(column_families_opts_map); + } + // SetOptions with potentially different options per column family. It is + // typically better to batch all option changes together as the OPTIONS file + // is written to once per SetOptions call. + virtual Status SetOptions( + const std::unordered_map>& + /*column_families_opts_map*/) { + return Status::NotSupported("Not implemented"); + } // Like SetOptions but for DBOptions, including the same caveats for // usability, reliability, and performance. See GetDBOptionsFromMap() (and @@ -1679,6 +1658,46 @@ class DB { // DisableManualCompaction() has been called. virtual void EnableManualCompaction() = 0; + // Abort all compaction work/jobs. This function will signal all + // running compactions (both automatic and manual, background and foreground) + // to abort and will wait for them to finish or abort before returning. After + // this function returns, new compaction work will be aborted immediately + // until ResumeAllCompactions() is called. + // + // The compaction abort is checked periodically (every 1000 keys processed), + // so ongoing compactions should abort as well within a reasonable time. + // This function blocks until all compactions have completed or aborted. + // + // Any output files from aborted compactions are automatically cleaned up, + // ensuring no partial compaction results are installed, except for resumable + // compaction. + // + // This function supports concurrent abort requests from multiple callers + // without coordination between them. The call count is tracked, and + // compactions only resume after the number of ResumeAllCompactions() calls + // matches number of AbortAllCompactions() calls. + // + // Differences with other compaction control APIs: + // - DisableManualCompaction(): Only pauses manual compactions, waits for + // them to finish naturally. AbortAllCompactions() actively cancels both + // automatic and manual compactions. + // - PauseBackgroundWork(): Pauses all background work (flush + compaction), + // waits for work to finish naturally. AbortAllCompactions() only affects + // compactions and actively cancels them. + // + // Note: Compaction service (remote compaction) is not currently supported. + // Aborted compactions return Status::Incomplete with subcode + // kCompactionAborted. + virtual void AbortAllCompactions() = 0; + + // Resume all compactions that were aborted by AbortAllCompactions(). + // This function must be called as many times as AbortAllCompactions() + // has been called in order to resume compactions. This reference-counting + // behavior ensures that if multiple callers independently request an + // abort, compactions will not resume until all of them have called + // ResumeAllCompactions(). + virtual void ResumeAllCompactions() = 0; + // Wait for all flush and compactions jobs to finish. Jobs to wait include the // unscheduled (queued, but not scheduled yet). If the db is shutting down, // Status::ShutdownInProgress will be returned. @@ -1695,13 +1714,6 @@ class DB { virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0; virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); } - // Maximum level to which a new compacted memtable is pushed if it - // does not create overlap. - virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0; - virtual int MaxMemCompactionLevel() { - return MaxMemCompactionLevel(DefaultColumnFamily()); - } - // Number of files in level-0 that would stop writes. virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0; virtual int Level0StopWriteTrigger() { @@ -1758,6 +1770,10 @@ class DB { return Status::NotSupported("FlushWAL not implemented"); } + virtual Status FlushWAL(const FlushWALOptions& /*options*/) { + return Status::NotSupported("FlushWAL not implemented"); + } + // Ensure all WAL writes have been synced to storage, so that (assuming OS // and hardware support) data will survive power loss. This function does // not imply FlushWAL, so `FlushWAL(true)` is recommended if using @@ -1803,6 +1819,25 @@ class DB { virtual Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family, std::string* ts_low) = 0; + // EXPERIMENTAL + // Get the newest timestamp of the column family. This is only for when the + // column family enables user defined timestamp and when timestamps are not + // persisted in SST files, a.k.a `persist_user_defined_timestamps=false`. + // This checks the mutable memtable, the immutable memtable and the SST files, + // and returns the first newest user defined timestamp found. + // When user defined timestamp is not persisted in SST files, metadata in + // MANIFEST tracks the most recently seen timestamp for SST files, so the + // newest timestamp in SST files can be found. + // OK status is returned if finding the newest timestamp succeeds, if + // `newest_timestamp` is empty, it means the column family hasn't seen any + // timestamp. The returned timestamp is encoded, util method `DecodeU64Ts` can + // be used to decode it into uint64_t. + // User-defined timestamp is required to be increasing per key, the return + // value of this API would be most useful if the user-defined timestamp is + // monotonically increasing across keys. + virtual Status GetNewestUserDefinedTimestamp( + ColumnFamilyHandle* column_family, std::string* newest_timestamp) = 0; + // Suspend deleting obsolete files. Compactions will continue to occur, // but no obsolete files will be deleted. To resume file deletions, each // call to DisableFileDeletions() must be matched by a subsequent call to @@ -1878,11 +1913,24 @@ class DB { virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/, ColumnFamilyMetaData* /*metadata*/) {} + // Obtains the LSM-tree meta data of the specified column family of the DB + // with optional filtering by key range and level. + virtual void GetColumnFamilyMetaData( + ColumnFamilyHandle* /*column_family*/, + const GetColumnFamilyMetaDataOptions& /*options*/, + ColumnFamilyMetaData* /*metadata*/) {} + // Get the metadata of the default column family. void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) { GetColumnFamilyMetaData(DefaultColumnFamily(), metadata); } + // Get the metadata of the default column family with optional filtering. + void GetColumnFamilyMetaData(const GetColumnFamilyMetaDataOptions& options, + ColumnFamilyMetaData* metadata) { + GetColumnFamilyMetaData(DefaultColumnFamily(), options, metadata); + } + // Obtains the LSM-tree meta data of all column families of the DB, including // metadata for each live table (SST) file and each blob file in the DB. virtual void GetAllColumnFamilyMetaData( @@ -1914,12 +1962,12 @@ class DB { // Retrieve information about the current wal file // // Note that the log might have rolled after this call in which case - // the current_log_file would not point to the current log file. + // the current_wal_file would not point to the current log file. // - // Additionally, for the sake of optimization current_log_file->StartSequence + // Additionally, for the sake of optimization current_wal_file->StartSequence // would always be set to 0 virtual Status GetCurrentWalFile( - std::unique_ptr* current_log_file) = 0; + std::unique_ptr* current_wal_file) = 0; // IngestExternalFile() will load a list of external SST files (1) into the DB // Two primary modes are supported: @@ -1928,7 +1976,9 @@ class DB { // In the first mode we will try to find the lowest possible level that // the file can fit in, and ingest the file into this level (2). A file that // have a key range that overlap with the memtable key range will require us - // to Flush the memtable first before ingesting the file. + // to Flush the memtable first before ingesting the file. If ingested files + // have any overlap with each other, level and sequence number assignment + // ensure later files overwrite earlier files. // In the second mode we will always ingest in the bottom most level (see // docs to IngestExternalFileOptions::ingest_behind). // For a column family that enables user-defined timestamps, ingesting @@ -1946,7 +1996,7 @@ class DB { // even if the file compression doesn't match the level compression // (3) If IngestExternalFileOptions->ingest_behind is set to true, // we always ingest at the bottommost level, which should be reserved - // for this purpose (see DBOPtions::allow_ingest_behind flag). + // for this purpose (see ColumnFamilyOptions::cf_allow_ingest_behind). // (4) If IngestExternalFileOptions->fail_if_not_bottommost_level is set to // true, then this method can return Status:TryAgain() indicating that // the files cannot be ingested to the bottommost level, and it is the @@ -2081,14 +2131,11 @@ class DB { ColumnFamilyHandle* column_family, const Range* range, std::size_t n, TablePropertiesCollection* props) = 0; - // Get the table properties of files per level. - virtual Status GetPropertiesOfTablesForLevels( - ColumnFamilyHandle* /* column_family */, - std::vector< - std::unique_ptr>* /* levels_props */) { - return Status::NotSupported( - "GetPropertiesOfTablesForLevels() is not implemented."); - } + // Get the table properties of files by level. + virtual Status GetPropertiesOfTablesByLevel( + ColumnFamilyHandle* column_family, + std::vector>* + props_by_level) = 0; virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/, const Slice* /*begin*/, @@ -2210,12 +2257,9 @@ inline Status DB::GetApproximateSizes(ColumnFamilyHandle* column_family, uint64_t* sizes, SizeApproximationFlags include_flags) { SizeApproximationOptions options; - options.include_memtables = - ((include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != - SizeApproximationFlags::NONE); - options.include_files = - ((include_flags & SizeApproximationFlags::INCLUDE_FILES) != - SizeApproximationFlags::NONE); + using enum SizeApproximationFlags; // Require C++20 support + options.include_memtables = ((include_flags & INCLUDE_MEMTABLES) != NONE); + options.include_files = ((include_flags & INCLUDE_FILES) != NONE); return GetApproximateSizes(options, column_family, ranges, n, sizes); } diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 0d5f24b52683..6dbfa7537bac 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -39,7 +39,7 @@ #undef LoadLibrary #endif -#if defined(__GNUC__) || defined(__clang__) +#if defined(__GNUC__) || defined(__clang__) // ODR-SAFE (essentially) #define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) \ __attribute__((__format__(__printf__, format_param, dots_param))) #else @@ -455,10 +455,146 @@ class Env : public Customizable { kVerifyFileChecksums = 7, kGetEntity = 8, kMultiGetEntity = 9, - kReadManifest = 10, - kUnknown, // Keep last for easy array of non-unknowns + kGetFileChecksumsFromCurrentManifest = 10, + // Enums after this, up to 0x7F, are reserved for future use for the public + // RocksDB API (i.e. they should be "non-custom" IO activities). Make sure + // to also update IOActivityToString when adding new values. + + kCustomIOActivity80 = 0x80, + kFirstCustomIOActivity = kCustomIOActivity80, + kCustomIOActivity81 = 0x81, + kCustomIOActivity82 = 0x82, + kCustomIOActivity83 = 0x83, + kCustomIOActivity84 = 0x84, + kCustomIOActivity85 = 0x85, + kCustomIOActivity86 = 0x86, + kCustomIOActivity87 = 0x87, + kCustomIOActivity88 = 0x88, + kCustomIOActivity89 = 0x89, + kCustomIOActivity8A = 0x8A, + kCustomIOActivity8B = 0x8B, + kCustomIOActivity8C = 0x8C, + kCustomIOActivity8D = 0x8D, + kCustomIOActivity8E = 0x8E, + kCustomIOActivity8F = 0x8F, + kCustomIOActivity90 = 0x90, + kCustomIOActivity91 = 0x91, + kCustomIOActivity92 = 0x92, + kCustomIOActivity93 = 0x93, + kCustomIOActivity94 = 0x94, + kCustomIOActivity95 = 0x95, + kCustomIOActivity96 = 0x96, + kCustomIOActivity97 = 0x97, + kCustomIOActivity98 = 0x98, + kCustomIOActivity99 = 0x99, + kCustomIOActivity9A = 0x9A, + kCustomIOActivity9B = 0x9B, + kCustomIOActivity9C = 0x9C, + kCustomIOActivity9D = 0x9D, + kCustomIOActivity9E = 0x9E, + kCustomIOActivity9F = 0x9F, + kCustomIOActivityA0 = 0xA0, + kCustomIOActivityA1 = 0xA1, + kCustomIOActivityA2 = 0xA2, + kCustomIOActivityA3 = 0xA3, + kCustomIOActivityA4 = 0xA4, + kCustomIOActivityA5 = 0xA5, + kCustomIOActivityA6 = 0xA6, + kCustomIOActivityA7 = 0xA7, + kCustomIOActivityA8 = 0xA8, + kCustomIOActivityA9 = 0xA9, + kCustomIOActivityAA = 0xAA, + kCustomIOActivityAB = 0xAB, + kCustomIOActivityAC = 0xAC, + kCustomIOActivityAD = 0xAD, + kCustomIOActivityAE = 0xAE, + kCustomIOActivityAF = 0xAF, + kCustomIOActivityB0 = 0xB0, + kCustomIOActivityB1 = 0xB1, + kCustomIOActivityB2 = 0xB2, + kCustomIOActivityB3 = 0xB3, + kCustomIOActivityB4 = 0xB4, + kCustomIOActivityB5 = 0xB5, + kCustomIOActivityB6 = 0xB6, + kCustomIOActivityB7 = 0xB7, + kCustomIOActivityB8 = 0xB8, + kCustomIOActivityB9 = 0xB9, + kCustomIOActivityBA = 0xBA, + kCustomIOActivityBB = 0xBB, + kCustomIOActivityBC = 0xBC, + kCustomIOActivityBD = 0xBD, + kCustomIOActivityBE = 0xBE, + kCustomIOActivityBF = 0xBF, + kCustomIOActivityC0 = 0xC0, + kCustomIOActivityC1 = 0xC1, + kCustomIOActivityC2 = 0xC2, + kCustomIOActivityC3 = 0xC3, + kCustomIOActivityC4 = 0xC4, + kCustomIOActivityC5 = 0xC5, + kCustomIOActivityC6 = 0xC6, + kCustomIOActivityC7 = 0xC7, + kCustomIOActivityC8 = 0xC8, + kCustomIOActivityC9 = 0xC9, + kCustomIOActivityCA = 0xCA, + kCustomIOActivityCB = 0xCB, + kCustomIOActivityCC = 0xCC, + kCustomIOActivityCD = 0xCD, + kCustomIOActivityCE = 0xCE, + kCustomIOActivityCF = 0xCF, + kCustomIOActivityD0 = 0xD0, + kCustomIOActivityD1 = 0xD1, + kCustomIOActivityD2 = 0xD2, + kCustomIOActivityD3 = 0xD3, + kCustomIOActivityD4 = 0xD4, + kCustomIOActivityD5 = 0xD5, + kCustomIOActivityD6 = 0xD6, + kCustomIOActivityD7 = 0xD7, + kCustomIOActivityD8 = 0xD8, + kCustomIOActivityD9 = 0xD9, + kCustomIOActivityDA = 0xDA, + kCustomIOActivityDB = 0xDB, + kCustomIOActivityDC = 0xDC, + kCustomIOActivityDD = 0xDD, + kCustomIOActivityDE = 0xDE, + kCustomIOActivityDF = 0xDF, + kCustomIOActivityE0 = 0xE0, + kCustomIOActivityE1 = 0xE1, + kCustomIOActivityE2 = 0xE2, + kCustomIOActivityE3 = 0xE3, + kCustomIOActivityE4 = 0xE4, + kCustomIOActivityE5 = 0xE5, + kCustomIOActivityE6 = 0xE6, + kCustomIOActivityE7 = 0xE7, + kCustomIOActivityE8 = 0xE8, + kCustomIOActivityE9 = 0xE9, + kCustomIOActivityEA = 0xEA, + kCustomIOActivityEB = 0xEB, + kCustomIOActivityEC = 0xEC, + kCustomIOActivityED = 0xED, + kCustomIOActivityEE = 0xEE, + kCustomIOActivityEF = 0xEF, + kCustomIOActivityF0 = 0xF0, + kCustomIOActivityF1 = 0xF1, + kCustomIOActivityF2 = 0xF2, + kCustomIOActivityF3 = 0xF3, + kCustomIOActivityF4 = 0xF4, + kCustomIOActivityF5 = 0xF5, + kCustomIOActivityF6 = 0xF6, + kCustomIOActivityF7 = 0xF7, + kCustomIOActivityF8 = 0xF8, + kCustomIOActivityF9 = 0xF9, + kCustomIOActivityFA = 0xFA, + kCustomIOActivityFB = 0xFB, + kCustomIOActivityFC = 0xFC, + kCustomIOActivityFD = 0xFD, + kCustomIOActivityFE = 0xFE, + kLastCustomIOActivity = kCustomIOActivityFE, + + kUnknown = 0xFF, // Keep last as unknown }; + static std::string IOActivityToString(IOActivity activity); + // Arrange to run "(*function)(arg)" once in a background thread, in // the thread pool specified by pri. By default, jobs go to the 'LOW' // priority thread pool. @@ -864,6 +1000,13 @@ class RandomAccessFile { "RandomAccessFile::InvalidateCache not supported."); } + // The default implementation returns "not supported" so that user + // implementations of FSRandomAccessFile do not need to immediately implement + // this function. + virtual Status GetFileSize(uint64_t* /*result*/) { + return Status::NotSupported("RandomAccessFile::GetFileSize not supported."); + } + // If you're adding methods here, remember to add them to // RandomAccessFileWrapper too. }; @@ -1748,6 +1891,9 @@ class RandomAccessFileWrapper : public RandomAccessFile { Status InvalidateCache(size_t offset, size_t length) override { return target_->InvalidateCache(offset, length); } + Status GetFileSize(uint64_t* file_size) override { + return target_->GetFileSize(file_size); + } private: RandomAccessFile* target_; diff --git a/include/rocksdb/env_encryption.h b/include/rocksdb/env_encryption.h index 6b4a13e039b6..118e8a052231 100644 --- a/include/rocksdb/env_encryption.h +++ b/include/rocksdb/env_encryption.h @@ -240,6 +240,15 @@ class EncryptedRandomAccessFile : public FSRandomAccessFile { size_t GetRequiredBufferAlignment() const override; IOStatus InvalidateCache(size_t offset, size_t length) override; + + // Intentionally leave GetFileSize not overridden here, so that it inherits + // the default implementation from its parent class, which is Not Supported. + // + // As GetFileSize API is not required to be implemented yet, we use encrypted + // file system in unit test to validate the rest of the system could continue + // working with the Not Supported behavior. + // + // IOStatus GetFileSize(uint64_t* /*result*/) override; }; class EncryptedWritableFile : public FSWritableFile { diff --git a/include/rocksdb/experimental.h b/include/rocksdb/experimental.h index 349d05f9b403..42b40cfa4754 100644 --- a/include/rocksdb/experimental.h +++ b/include/rocksdb/experimental.h @@ -21,6 +21,11 @@ Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end); Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end); +// DEPRECATED: this API may be removed in a future release. +// This operation can be done through CompactRange() by setting +// CompactRangeOptions::bottommost_level_compaction set to +// BottommostLevelCompaction::kSkip and setting target level. +// // Move all L0 files to target_level skipping compaction. // This operation succeeds only if the files in L0 have disjoint ranges; this // is guaranteed to happen, for instance, if keys are inserted in sorted @@ -81,7 +86,7 @@ Status UpdateManifestForFilesState( // keys in a category to return an empty sequence of segments. // // To eliminate a confusing distinction between a segment that is empty vs. -// "not present" for a particular key, each key is logically assiciated with +// "not present" for a particular key, each key is logically associated with // an infinite sequence of segments, including some infinite tail of 0-length // segments. In practice, we only represent a finite sequence that (at least) // covers the non-trivial segments. @@ -215,7 +220,7 @@ Status UpdateManifestForFilesState( // whole key. // * Range query - Whether there {definitely isn't, might be} any entries // within a lower and upper key bound, in an SST file (or partition, etc.). -// NOTE: For this disucssion, we ignore the detail of inclusive vs. +// NOTE: For this discussion, we ignore the detail of inclusive vs. // exclusive bounds by assuming a generalized notion of "bound" (vs. key) // that conveniently represents spaces between keys. For details, see // https://github.com/facebook/rocksdb/pull/11434 @@ -295,7 +300,7 @@ Status UpdateManifestForFilesState( // * Keys x and z are in categories in category set s, and // * Key y is ordered x < y < z according to the CF comparator, // then both -// * The common segment prefix property is satisifed through ordinal i-1 +// * The common segment prefix property is satisfied through ordinal i-1 // and with category set s // * x_i..j <= y_i..j <= z_i..j according to segment comparator c, where // x_i..j is the concatenation of segments i through j of key x (etc.). diff --git a/include/rocksdb/external_table.h b/include/rocksdb/external_table.h new file mode 100644 index 000000000000..844ba9d96b85 --- /dev/null +++ b/include/rocksdb/external_table.h @@ -0,0 +1,275 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/advanced_iterator.h" +#include "rocksdb/customizable.h" +#include "rocksdb/file_checksum.h" +#include "rocksdb/file_system.h" +#include "rocksdb/iterator_base.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class ExternalTableFactory; + +// EXPERIMENTAL +// The interface defined in this file is subject to change at any time without +// warning!! + +// This file defines an interface for plugging in an external table +// into RocksDB. The external table reader will be used instead of the +// BlockBasedTable to load and query sst files. +// The external table files can be created using an SstFileWriter. Eventually +// external tables will be allowed to be ingested into a RocksDB instance +// using the IngestExternalFIle() API. +// +// Initial support is for writing and querying the files using an +// SstFileWriter and SstFileReader. We will add support for ingestion of an +// external table into a limited RocksDB instance that only supports ingestion +// and not live writes in the near future. It'll be followed by support for +// replacing the column family by ingesting a new set of files. In all cases, +// the external table files will only be allowed in the bottommost level. +// +// The external table can support one or both of the following layouts - +// 1. Total order seek - All the keys in the files are in sorted order, and a +// user can seek to the first, last, or any key in between and iterate +// forwards or backwards till the end of the range. To support this mode, +// the implementation needs to use the comparator passed in +// ExternalTableOptions to enforce the key ordering. The prefix_extractor +// in ExternalTableOptions and the ExternalTableReader interfaces can be +// ignored. +// 2. Prefix seek - In this mode, the prefix_extractor is used to extract the +// prefix from a key. All the keys sharing the same prefix are ordered in +// ascending order according to the comparator. However, no specific +// ordering is required across prefixes. Users can scan keys by seeking +// to a specific key inside a prefix, and iterate forwards or backwards +// within the prefix. The prefix_same_as_start flag in ReadOptions will +// be true. +// 3. Both - If supporting both of the above, a user can seek inside a prefix +// and iterate beyond the prefix. The prefix_same_as_start in ReadOptions +// will be false. Additionally, the total_order_seek flag can be set to +// true to seek to the first non-empty prefix (as determined by the key +// order) if the seek prefix is empty. +// +// Many of the options in ReadOptions and WriteOptions may not be relevant to +// the external table implementation. +// TODO: Specify which options are relevant + +class ExternalTableIterator : public IteratorBase { + public: + virtual ~ExternalTableIterator() {} + + // This can optionally be called to prepare the iterator for a series + // of scans. The scan_opts parameter specifies the order of scans to + // follow, as well as the limits for those scans. After calling this, + // the caller will Seek() the iterator to successive start keys in scan_opts. + // + // If Prepare() is called again with a different scan_opts pointer, it + // means the iterator will be reused for a new multi scan. If scan_opts + // is null, then the previous Prepare() can be discarded. + // + // The caller guarantees the lifetime of scan_opts until its either cleared + // or replaced by another Prepare(). + // TODO: Update the contract to trim the scan_opts range to only include + // scans that potentially intersect the file key range. + // + // If the sequence of Seeks is interrupted by seeking to some other target + // key, then the iterator is free to discard anything done during Prepare. + virtual void Prepare(const ScanOptions scan_opts[], size_t num_opts) = 0; + + // Similar to Next(), except it also fills the result and returns whether + // the iterator is on a valid key or not + virtual bool NextAndGetResult(IterateResult* result) = 0; + + // Prepares the value if its lazily materialized. The implementation can + // request that this be called by setting value_prepared to false in + // IterateResult. Next() should always implicitly materialize the + // value. + bool PrepareValue() override = 0; + + // Return the current key's value + virtual Slice value() const = 0; + + // Return the current position bounds check result - kInbound if the + // position is a valid key, kOutOfBound if the key is out of bound (i.e + // scan has terminated), or kUnknown if end of file. + virtual IterBoundCheck UpperBoundCheckResult() = 0; +}; + +class ExternalTableReader { + public: + virtual ~ExternalTableReader() {} + + // Return an Iterator that can be used to scan the table file. + // The read_options can optionally contain the upper bound + // key (exclusive) of the scan in iterate_upper_bound. + virtual ExternalTableIterator* NewIterator( + const ReadOptions& read_options, + const SliceTransform* prefix_extractor) = 0; + + // Point lookup the given key and return its value + virtual Status Get(const ReadOptions& read_options, const Slice& key, + const SliceTransform* prefix_extractor, + std::string* value) = 0; + + // Point lookup the given vector of keys and return the values, as well + // as status of each individual lookup in statuses. + virtual void MultiGet(const ReadOptions& read_options, + const std::vector& keys, + const SliceTransform* prefix_extractor, + std::vector* values, + std::vector* statuses) = 0; + + // Allocate and return the contents of the properties block. If the builder + // supports PutPropertiesBlock(), then this must be supported. The + // properties block should be written to the table file as is (no + // compression or mutation of any kind), and its offset in the file + // should be returned in file_offset. + virtual Status GetPropertiesBlock(std::unique_ptr* /*property_block*/, + uint64_t* /*size*/, + uint64_t* /*file_offset*/) { + return Status::NotSupported(); + } + + // Return TableProperties for the file. At a minimum, the following + // properties need to be returned - + // comparator_name + // num_entries + // raw_key_size + // raw_value_size + virtual std::shared_ptr GetTableProperties() const = 0; + + virtual Status VerifyChecksum(const ReadOptions& /*ro*/) { + return Status::NotSupported("VerifyChecksum() not supported"); + } +}; + +// A table builder interface that can be used by SstFileWriter to allow +// RocksDB users to write external table files. The sequence of operations +// to write an external table is as follows - +// 1. Add() is called one or more times to write all key-values to the table. +// Its called in increasing key order, as determined by the comparator. +// The input key is a user key, i.e sequence number and value type are +// stripped out. +// 2. After every Add() operation, status() is called to check the current +// status. +// 3. After the last key is added, Finish() is called to do whatever is +// necessary to ensure the data is persisted in the table file. +// 4. If there is a failure midway for some reason, Abandon() is called +// instead of Finish(). +// 5. At the end, FileSize(), GetTableProperties(), and status() are called to +// get the final size of the file, the table properties, and the final +// status. GetFileChecksum() and GetFileChecksumFuncName() may also be +// called to get checksum information about the whole file, but their +// implementation is optional. +class ExternalTableBuilder { + public: + virtual ~ExternalTableBuilder() {} + + // Write a single KV to the table file. This is guaranteed to be called + // in key order, and the write may be buffered and flushed at a later time. + virtual void Add(const Slice& key, const Slice& value) = 0; + + // Return the current Status. This could return non-ok, for example, if + // Add() fails for some reason. + virtual Status status() const = 0; + + // Flush and close the table file + virtual Status Finish() = 0; + + // Delete the partial file and release any allocated resources. Either this + // or Finish() will be called, but not both. + virtual void Abandon() = 0; + + // Return the size of the table file. Will be called at the end, after + // Finish(). + virtual uint64_t FileSize() const = 0; + + // Write the raw properties block as is in the table file + virtual Status PutPropertiesBlock(const Slice& /*property_block*/) { + return Status::NotSupported(); + } + + // As mentioned in earlier comments, the following table properties must be + // returned at a minimum - + // comparator_name + // num_entries + // raw_key_size + // raw_value_size + virtual TableProperties GetTableProperties() const = 0; + + virtual std::string GetFileChecksum() const { return kUnknownFileChecksum; } + + virtual const char* GetFileChecksumFuncName() const { + return kUnknownFileChecksumFuncName; + } +}; + +struct ExternalTableOptions { + const std::shared_ptr& prefix_extractor; + const Comparator* comparator; + const std::shared_ptr& fs; + const FileOptions& file_options; + + ExternalTableOptions( + const std::shared_ptr& _prefix_extractor, + const Comparator* _comparator, const std::shared_ptr& _fs, + const FileOptions& _file_options) + : prefix_extractor(_prefix_extractor), + comparator(_comparator), + fs(_fs), + file_options(_file_options) {} +}; + +struct ExternalTableBuilderOptions { + const ReadOptions& read_options; + const WriteOptions& write_options; + const std::shared_ptr& prefix_extractor; + const Comparator* comparator; + const std::string& column_family_name; + const std::string db_id; + const std::string db_session_id; + const TableFileCreationReason reason; + + ExternalTableBuilderOptions( + const ReadOptions& _read_options, const WriteOptions& _write_options, + const std::shared_ptr& _prefix_extractor, + const Comparator* _comparator, const std::string& _column_family_name, + const TableFileCreationReason _reason) + : read_options(_read_options), + write_options(_write_options), + prefix_extractor(_prefix_extractor), + comparator(_comparator), + column_family_name(_column_family_name), + reason(_reason) {} +}; + +class ExternalTableFactory : public Customizable { + public: + ~ExternalTableFactory() override {} + + const char* Name() const override { return "ExternalTableFactory"; } + + virtual Status NewTableReader( + const ReadOptions& read_options, const std::string& file_path, + const ExternalTableOptions& table_options, + std::unique_ptr* table_reader) const = 0; + + // The table builder should use the file pointer to append to the file. + // Do not sync or close the file after finishing. RocksDB will do that. + virtual ExternalTableBuilder* NewTableBuilder( + const ExternalTableBuilderOptions& builder_options, + const std::string& file_path, FSWritableFile* file) const = 0; +}; + +// Allocate a TableFactory that wraps around an ExternalTableFactory. Use this +// to allocate and set in ColumnFamilyOptions::table_factory. +std::unique_ptr NewExternalTableFactory( + std::shared_ptr inner_factory); + +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/external_table_reader.h b/include/rocksdb/external_table_reader.h deleted file mode 100644 index 9bba9f4f3eff..000000000000 --- a/include/rocksdb/external_table_reader.h +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once - -#include "rocksdb/customizable.h" -#include "rocksdb/iterator.h" -#include "rocksdb/options.h" -#include "rocksdb/status.h" - -namespace ROCKSDB_NAMESPACE { - -class ExternalTableFactory; - -// EXPERIMENTAL -// The interface defined in this file is subject to change at any time without -// warning!! - -// This file defines an interface for plugging in an external table reader -// into RocksDB. The external table reader will be used instead of the -// BlockBasedTable to load and query sst files. As of now, creating the -// external table files using RocksDB is not supported, but will be added in -// the near future. The external table files can be created outside and -// RocksDB and ingested into a RocksDB instance using the IngestExternalFIle() -// API. -// -// Initial support is for loading and querying the files using an -// SstFileReader. We will add support for ingestion of an external table -// into a limited RocksDB instance that only supports ingestion and not live -// writes in the near future. It'll be followed by support for replacing the -// column family by ingesting a new set of files. In all cases, the external -// table files will only be allowed in the bottommost level. -// -// The external table reader can support one or both of the following layouts - -// 1. Total order seek - All the keys in the files are in sorted order, and a -// user can seek to the first, last, or any key in between and iterate -// forwards or backwards till the end of the range. To support this mode, -// the implementation needs to use the comparator passed in -// ExternalTableOptions to enforce the key ordering. The prefix_extractor -// in ExternalTableOptions and the ExternalTableReader interfaces can be -// ignored. -// 2. Prefix seek - In this mode, the prefix_extractor is used to extract the -// prefix from a key. All the keys sharing the same prefix are ordered in -// ascending order according to the comparator. However, no specific -// ordering is required across prefixes. Users can scan keys by seeking -// to a specific key inside a prefix, and iterate forwards or backwards -// within the prefix. The prefix_same_as_start flag in ReadOptions will -// be true. -// 3. Both - If supporting both of the above, a user can seek inside a prefix -// and iterate beyond the prefix. The prefix_same_as_start in ReadOptions -// will be false. Additionally, the total_order_seek flag can be set to -// true to seek to the first non-empty prefix (as determined by the key -// order) if the seek prefix is empty. -// -// Many of the options in ReadOptions may not be relevant to the external -// table implementation. -// TODO: Specify which options are relevant - -class ExternalTableReader { - public: - virtual ~ExternalTableReader() {} - - // Return an Iterator that can be used to scan the table file. - // The read_options can optionally contain the upper bound - // key (exclusive) of the scan in iterate_upper_bound. - virtual Iterator* NewIterator(const ReadOptions& read_options, - const SliceTransform* prefix_extractor) = 0; - - // Point lookup the given key and return its value - virtual Status Get(const ReadOptions& read_options, const Slice& key, - const SliceTransform* prefix_extractor, - std::string* value) = 0; - - // Point lookup the given vector of keys and return the values, as well - // as status of each individual lookup in statuses. - virtual void MultiGet(const ReadOptions& read_options, - const std::vector& keys, - const SliceTransform* prefix_extractor, - std::vector* values, - std::vector* statuses) = 0; - - // Return TableProperties for the file. At a minimum, the following - // properties need to be returned - - // comparator_name - // num_entries - // raw_key_size - // raw_value_size - virtual std::shared_ptr GetTableProperties() const = 0; - - virtual Status VerifyChecksum(const ReadOptions& /*ro*/) { - return Status::NotSupported("VerifyChecksum() not supported"); - } -}; - -struct ExternalTableOptions { - const std::shared_ptr& prefix_extractor; - const Comparator* comparator; - - ExternalTableOptions( - const std::shared_ptr& _prefix_extractor, - const Comparator* _comparator) - : prefix_extractor(_prefix_extractor), comparator(_comparator) {} -}; - -class ExternalTableFactory : public Customizable { - public: - ~ExternalTableFactory() override {} - - const char* Name() const override { return "ExternalTableFactory"; } - - virtual Status NewTableReader( - const ReadOptions& read_options, const std::string& file_path, - const ExternalTableOptions& table_options, - std::unique_ptr* table_reader) = 0; -}; - -// Allocate a TableFactory that wraps around an ExternalTableFactory. Use this -// to allocate and set in ColumnFamilyOptions::table_factory. -std::shared_ptr NewExternalTableFactory( - std::shared_ptr inner_factory); - -} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/file_checksum.h b/include/rocksdb/file_checksum.h index 66024d0a1b4e..70de891f2c05 100644 --- a/include/rocksdb/file_checksum.h +++ b/include/rocksdb/file_checksum.h @@ -22,7 +22,12 @@ namespace ROCKSDB_NAMESPACE { // The unknown file checksum. constexpr char kUnknownFileChecksum[] = ""; // The unknown sst file checksum function name. +// Indicates that the file metadata says that no checksum factory was configured +// when the file was written. constexpr char kUnknownFileChecksumFuncName[] = "Unknown"; +// Used when opening a file and there is no file checksum metadata to propagate +// at all. +constexpr char kNoFileChecksumFuncName[] = "Unavailable"; // The standard DB file checksum function name. // This is the name of the checksum function returned by // GetFileChecksumGenCrc32cFactory(); @@ -80,7 +85,8 @@ class FileChecksumGenFactory : public Customizable { const ConfigOptions& options, const std::string& value, std::shared_ptr* result); - // Create a new FileChecksumGenerator. + // Create a new FileChecksumGenerator. Recommended to return nullptr if the + // requested function name is not recognized. virtual std::unique_ptr CreateFileChecksumGenerator( const FileChecksumGenContext& context) = 0; diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 27e497f432b5..ea9d52bf6b30 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -18,11 +18,13 @@ #include +#include #include #include #include #include #include +#include #include #include #include @@ -88,6 +90,7 @@ enum FSSupportedOps { kVerifyAndReconstructRead, // Supports a higher level of data integrity. See // the verify_and_reconstruct_read flag in // IOOptions. + kFSPrefetch, // Supports prefetch operations }; // Per-request options that can be passed down to the FileSystem @@ -192,6 +195,25 @@ struct FileOptions : EnvOptions { // handoff during file writes. ChecksumType handoff_checksum_type; + // Expose write lifetime hint on the FileOptions level to provide more + // flexibility in setting the hint in downstream, custom implementations + // that might be able to process the hint only at the time of the actual + // FSWritableFile object creation. + Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET; + + // File checksum of the file being opened. Empty string if no checksum is + // available. + std::string file_checksum; + + // Name of the checksum function used to compute file_checksum. Set to + // kUnknownFileChecksumFuncName when file was created without a checksum + // factory. Set to kNoFileChecksumFuncName when no checksum metadata is + // available. + // Production FileSystems will accept empty values for both + // file_checksum and file_checksum_func_name, but internally within RocksDB + // that is forbidden for checking/auditing purposes. + std::string file_checksum_func_name; + FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const DBOptions& opts) @@ -206,13 +228,18 @@ struct FileOptions : EnvOptions { : EnvOptions(opts), io_options(opts.io_options), temperature(opts.temperature), - handoff_checksum_type(opts.handoff_checksum_type) {} + handoff_checksum_type(opts.handoff_checksum_type), + write_hint(opts.write_hint), + file_checksum(opts.file_checksum), + file_checksum_func_name(opts.file_checksum_func_name) {} FileOptions& operator=(const FileOptions&) = default; }; // A structure to pass back some debugging information from the FileSystem // implementation to RocksDB in case of an IO error +// TODO(virajthakur): Update all calls to FS APIs for writes to pass in +// IODebugContext struct IODebugContext { // file_path to be filled in by RocksDB in case of an error std::string file_path; @@ -223,8 +250,9 @@ struct IODebugContext { // To be set by the FileSystem implementation std::string msg; - // To be set by the underlying FileSystem implementation. - std::string request_id; + // To be set by the application, to allow tracing logs/metrics from user -> + // RocksDB -> FS. + const std::string* request_id = nullptr; // In order to log required information in IO tracing for different // operations, Each bit in trace_data stores which corresponding info from @@ -240,16 +268,48 @@ struct IODebugContext { }; uint64_t trace_data = 0; + // Arbitrary structure containing cost information about the IO request + std::any cost_info; + + // FileSystem implementations can use this mutex to synchronize concurrent + // reads/writes as needed (e.g. to update the counters or cost_info field) + std::shared_mutex mutex; + IODebugContext() {} + // Copy constructor + IODebugContext(const IODebugContext& other) + : file_path(other.file_path), + counters(other.counters), + msg(other.msg), + trace_data(other.trace_data), + cost_info(other.cost_info), + _request_id(other.request_id ? *other.request_id : "") { + request_id = other.request_id ? &_request_id : nullptr; + } + + // Copy assignment operator + IODebugContext& operator=(const IODebugContext& other) { + if (this != &other) { + file_path = other.file_path; + counters = other.counters; + msg = other.msg; + trace_data = other.trace_data; + cost_info = other.cost_info; + _request_id = other.request_id ? *other.request_id : ""; + request_id = other.request_id ? &_request_id : nullptr; + } + return *this; + } + void AddCounter(std::string& name, uint64_t value) { counters.emplace(name, value); } // Called by underlying file system to set request_id and log request_id in // IOTracing. - void SetRequestId(const std::string& _request_id) { - request_id = _request_id; + void SetRequestId(const std::string* updated_request_id) { + request_id = updated_request_id; trace_data |= (1 << TraceData::kRequestID); } @@ -262,6 +322,12 @@ struct IODebugContext { ss << msg; return ss.str(); } + + private: + // Private member that allows for safe copying of IODebugContext without any + // memory ownership issues. After copying, request_id can point directly to + // this field. + std::string _request_id; }; // A function pointer type for custom destruction of void pointer passed to @@ -507,7 +573,7 @@ class FileSystem : public Customizable { } // This seems to clash with a macro on Windows, so #undef it here -#ifdef DeleteFile +#ifdef DeleteFile // ODR-SAFE #undef DeleteFile #endif // Delete the named file. @@ -668,7 +734,7 @@ class FileSystem : public Customizable { const ImmutableDBOptions& db_options) const; // This seems to clash with a macro on Windows, so #undef it here -#ifdef GetFreeSpace +#ifdef GetFreeSpace // ODR-SAFE #undef GetFreeSpace #endif @@ -699,7 +765,7 @@ class FileSystem : public Customizable { // Abort the read IO requests submitted asynchronously. Underlying FS is // required to support AbortIO API. AbortIO implementation should ensure that // the all the read requests related to io_handles should be aborted and - // it shouldn't call the callback for these io_handles. + // it should call the callback for these io_handles. virtual IOStatus AbortIO(std::vector& /*io_handles*/) { return IOStatus::OK(); } @@ -721,12 +787,13 @@ class FileSystem : public Customizable { // If async_io is supported by the underlying FileSystem, then supported_ops // will have corresponding bit (i.e FSSupportedOps::kAsyncIO) set to 1. // - // By default, async_io operation is set and FS should override this API and - // set all the operations they support provided in FSSupportedOps (including - // async_io). + // By default, async_io and prefetch operation are set and FS should override + // this API and set all the operations they support provided in FSSupportedOps + // (including async_io and prefetch). virtual void SupportedOps(int64_t& supported_ops) { supported_ops = 0; supported_ops |= (1 << FSSupportedOps::kAsyncIO); + supported_ops |= (1 << FSSupportedOps::kFSPrefetch); } // If you're adding methods here, remember to add them to EnvWrapper too. @@ -1006,6 +1073,14 @@ class FSRandomAccessFile { // open. virtual Temperature GetTemperature() const { return Temperature::kUnknown; } + // Get the file size on an open-for-reading file without re-seeking the file's + // path in the filesystem. The default implementation returns "not supported" + // so that user implementations of FSRandomAccessFile do not need to + // immediately implement this function. + virtual IOStatus GetFileSize(uint64_t* /*result*/) { + return IOStatus::NotSupported("GetFileSize Not Supported"); + } + // If you're adding methods here, remember to add them to // RandomAccessFileWrapper too. }; @@ -1106,8 +1181,10 @@ class FSWritableFile { // Truncate is necessary to trim the file to the correct size // before closing. It is not always possible to keep track of the file - // size due to whole pages writes. The behavior is undefined if called - // with other writes to follow. + // size due to whole pages writes. If called with other writes to follow, + // the behavior is file system specific. Posix will reseek to the new EOF. + // Other file systems may behave differently. Its the caller's + // responsibility to check the file system contract. virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*options*/, IODebugContext* /*dbg*/) { return IOStatus::OK(); @@ -1727,6 +1804,10 @@ class FSRandomAccessFileWrapper : public FSRandomAccessFile { return target_->GetTemperature(); } + virtual IOStatus GetFileSize(uint64_t* result) override { + return target_->GetFileSize(result); + } + private: std::unique_ptr guard_; FSRandomAccessFile* target_; diff --git a/include/rocksdb/functor_wrapper.h b/include/rocksdb/functor_wrapper.h index 17b021bf73b5..50007b85d77a 100644 --- a/include/rocksdb/functor_wrapper.h +++ b/include/rocksdb/functor_wrapper.h @@ -44,7 +44,7 @@ void call(Function f, Tuple t) { template class FunctorWrapper { public: - explicit FunctorWrapper(std::function functor, Args &&...args) + explicit FunctorWrapper(std::function functor, Args&&... args) : functor_(std::move(functor)), args_(std::forward(args)...) {} void invoke() { detail::call(functor_, args_); } diff --git a/include/rocksdb/io_dispatcher.h b/include/rocksdb/io_dispatcher.h new file mode 100644 index 000000000000..6354d72ad36d --- /dev/null +++ b/include/rocksdb/io_dispatcher.h @@ -0,0 +1,358 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class FileSystem; +class Statistics; + +// Forward declaration for internal implementation +struct IODispatcherImplData; +struct PendingPrefetchRequest; + +// Options for configuring IODispatcher behavior +struct IODispatcherOptions { + // Maximum memory (in bytes) for prefetching across all ReadSets. + // When this limit is reached, SubmitJob() blocks until memory is released. + // Set to 0 (default) for unlimited prefetch memory. + size_t max_prefetch_memory_bytes = 0; + + // Optional statistics for tracking memory limiter metrics + Statistics* statistics = nullptr; +}; + +/* + * IODispatcher is a class that allows users to submit groups of IO jobs to be + * dispatched asynchronously (or synchronously), upon submission the + * IODispatcher will return a ReadSet which act as an ownership object of those + * IOs. Users read from their readset when they require the data, and either + * poll for completion of the block, or read synchronously if the block is not + * in cache at that point. + * + * ReadSets have RAII semantics, meaning on destruction they will cancel any on + * going IO, and release the underlying pinned blocks. + * + * IODispatcher main goal is to act as control plane for all readers using the + * dispatcher, allowing for future ratelimiting and smarter dispatching policies + * in the future. + * + * Example 1: Basic Usage + * ---------------------- + * // Submitting an IO job and reading blocks: + * // + * // std::shared_ptr job = std::make_shared(); + * // job->table = table_reader; // Provided BlockBasedTable* + * // job->job_options.io_coalesce_threshold = 32 * 1024; + * // job->job_options.read_options = read_options; // Provided ReadOptions + * // + * // // Populate the job with block handles (e.g., from an index/iterator) + * // job->block_handles.push_back(handle1); + * // job->block_handles.push_back(handle2); + * // job->block_handles.push_back(handle3); + * // + * // std::unique_ptr dispatcher(NewIODispatcher()); + * // std::shared_ptr read_set; + * // Status s = dispatcher->SubmitJob(job, &read_set); + * // if (!s.ok()) { + * // // Handle submit error + * // } + * // + * // // Read by index + * // for (size_t i = 1; i < job->block_handles.size(); ++i) { + * // CachableEntry block_entry; + * // Status rs = read_set->ReadIndex(i, &block_entry); + * // if (!rs.ok()) { + * // // Handle read error + * // continue; + * // } + * // // Use block_entry (block contents are pinned here) + * // } + * // + * // // Or read by byte offset + * // { + * // size_t offset = + static_cast(job->block_handles.front().offset()); + * // CachableEntry block_entry; + * // Status rs = read_set->ReadOffset(offset, &block_entry); + * // if (rs.ok()) { + * // // Use block_entry + * // } + * // } + * // + * // // Stats + * // uint64_t cache_hits = read_set->GetNumCacheHits(); + * // uint64_t async_reads = read_set->GetNumAsyncReads(); + * // uint64_t sync_reads = read_set->GetNumSyncReads(); + * + * Example 2: Memory-Limited Prefetching + * ------------------------------------- + * // Configure a memory budget for prefetching to prevent unbounded memory use. + * // When the budget is exceeded, IODispatcher uses "partial prefetch": + * // - Dispatches as many blocks as fit in available memory (earlier first) + * // - Queues remaining blocks for later dispatch when memory is released + * // - Never blocks on SubmitJob - remaining blocks are read on-demand + * // + * // IODispatcherOptions opts; + * // opts.max_prefetch_memory_bytes = 64 * 1024 * 1024; // 64MB budget + * // opts.statistics = db_options.statistics.get(); // Optional metrics + * // + * // std::unique_ptr dispatcher(NewIODispatcher(opts)); + * // + * // // Submit a job that needs more memory than available + * // // Partial prefetch will dispatch what fits immediately + * // std::shared_ptr read_set; + * // Status s = dispatcher->SubmitJob(job, &read_set); // Never blocks + * // + * // // Read blocks in order - earlier blocks are more likely to be prefetched + * // for (size_t i = 0; i < job->block_handles.size(); ++i) { + * // CachableEntry block; + * // Status rs = read_set->ReadIndex(i, &block); + * // // Use block... + * // + * // // Release block when done to free memory for pending prefetches + * // read_set->ReleaseBlock(i); // Triggers dispatch of queued blocks + * // } + * // + * // Memory limiting statistics (when statistics is configured): + * // - PREFETCH_MEMORY_BYTES_GRANTED: Total bytes acquired for prefetching + * // - PREFETCH_MEMORY_BYTES_RELEASED: Total bytes released after use + * // - PREFETCH_MEMORY_REQUESTS_BLOCKED: Number of blocks that couldn't be + * // prefetched immediately due to memory pressure + + */ + +class BlockHandle; +struct ReadOptions; +struct AsyncIOState; + +template +class CachableEntry; +class Block; +class BlockBasedTable; + +struct JobOptions { + uint64_t io_coalesce_threshold = 16 * 1024; + ReadOptions read_options; +}; + +class IOJob { + public: + std::vector block_handles; + + // Table reader for accessing block cache and index + BlockBasedTable* table = nullptr; + + // Job execution options + JobOptions job_options; +}; + +/* + * ReadSet represents a set of blocks that may be in cache, being read + * asynchronously, or need to be read synchronously. The Read() method + * transparently handles all three cases. + */ +class ReadSet { + public: + ReadSet() = default; + ~ReadSet(); + + ReadSet(const ReadSet&) = delete; + ReadSet& operator=(const ReadSet&) = delete; + ReadSet(ReadSet&&) noexcept = delete; + ReadSet& operator=(ReadSet&&) noexcept = delete; + + // Read a block by index + // - If the block is in cache, returns it immediately + // - If the block is being read asynchronously, polls for completion and + // returns it + // - If the block needs to be read, performs a synchronous read and returns it + // + // block_index: Index into the original IOJob's block_handles vector + // out: Output parameter for the pinned block entry + // + // Returns: Status::OK() on success, error status otherwise + Status ReadIndex(size_t block_index, CachableEntry* out); + // Read a block by offset + // - If the block is in cache, returns it immediately + // - If the block is being read asynchronously, polls for completion and + // returns it + // - If the block needs to be read, performs a synchronous read and returns it + + // block_offset: Byte Offset into the SST file of the block. + + // out: Output parameter for the pinned block entry + Status ReadOffset(size_t offset, CachableEntry* out); + + // Release a block by index, unpinning it from cache. + // After this call, ReadIndex() for this block will return an error. + // This is useful for eager memory reclamation when blocks are no longer + // needed. + void ReleaseBlock(size_t block_index); + + // Check if a block at the given index is still available (not released). + // Returns true if the block can be read, false otherwise. + bool IsBlockAvailable(size_t block_index) const; + + // Statistics accessors + uint64_t GetNumSyncReads() const { return num_sync_reads_; } + uint64_t GetNumAsyncReads() const { return num_async_reads_; } + uint64_t GetNumCacheHits() const { return num_cache_hits_; } + + private: + friend class IODispatcherImpl; + + // Job data + std::shared_ptr job_; + + // FileSystem for calling AbortIO in destructor + std::shared_ptr fs_; + + // Storage for pinned blocks (one per block handle in the job) + std::vector> pinned_blocks_; + + // Sorted index for binary search in ReadOffset. + // sorted_block_indices_[i] is the original index of the i-th smallest block + // by offset. Built once during SubmitJob for O(log n) ReadOffset lookups. + std::vector sorted_block_indices_; + + // Map from block index to async IO state for blocks being read + // asynchronously. Multiple block indices may map to the same async state when + // blocks are coalesced into a single IO request. + std::unordered_map> async_io_map_; + + // For memory release notifications back to dispatcher (weak ref to avoid + // cycles) + std::weak_ptr dispatcher_data_; + + // Size of each block (parallel to pinned_blocks_) for memory accounting + std::vector block_sizes_; + + // Statistics counters + std::atomic num_sync_reads_ = 0; + std::atomic num_async_reads_ = 0; + std::atomic num_cache_hits_ = 0; + + // Poll and process a specific async IO request + Status PollAndProcessAsyncIO( + const std::shared_ptr& async_state); + + // Perform synchronous read for a specific block + Status SyncRead(size_t block_index); + + // Remove a block from pending prefetch (called by ReadIndex/ReleaseBlock) + void RemoveFromPending(size_t block_index); + + // Atomic flags indicating if block is pending prefetch (lock-free check) + std::unique_ptr[]> pending_prefetch_flags_; + size_t pending_prefetch_flags_size_ = 0; + + // Reference to pending request (for removal notification) + std::shared_ptr pending_request_; +}; + +/* + * IODispatcher handles IO operations synchronously or asynchronously based + * on JobOptions. When async is true, it uses ReadAsync; when false, it uses + * standard synchronous reads. + * */ +class IODispatcher { + protected: + IODispatcher() = default; + + public: + virtual ~IODispatcher() {} + + IODispatcher(const IODispatcher&) = delete; + IODispatcher& operator=(const IODispatcher&) = delete; + IODispatcher(IODispatcher&&) = delete; + IODispatcher& operator=(IODispatcher&&) = delete; + + // Submit a job for IO processing + // job: The IO job to submit + // read_set: Output parameter that will be populated with the ReadSet on + // success Returns: Status::OK() on success, error status otherwise + virtual Status SubmitJob(const std::shared_ptr& job, + std::shared_ptr* read_set) = 0; +}; + +// Create IODispatcher with default options (no memory limit) +IODispatcher* NewIODispatcher(); + +// Create IODispatcher with custom options +IODispatcher* NewIODispatcher(const IODispatcherOptions& options); + +// TrackingIODispatcher wraps another IODispatcher and tracks all ReadSets +// created. This is useful for testing to verify IO statistics. +class TrackingIODispatcher : public IODispatcher { + public: + TrackingIODispatcher() : impl_(NewIODispatcher()) {} + explicit TrackingIODispatcher(IODispatcher* impl) : impl_(impl) {} + + Status SubmitJob(const std::shared_ptr& job, + std::shared_ptr* read_set) override { + Status s = impl_->SubmitJob(job, read_set); + if (s.ok() && read_set && *read_set) { + read_sets_.push_back(*read_set); + } + return s; + } + + // Get all ReadSets created by this dispatcher + const std::vector>& GetReadSets() const { + return read_sets_; + } + + // Get aggregated statistics from all ReadSets + uint64_t GetTotalSyncReads() const { + uint64_t total = 0; + for (const auto& rs : read_sets_) { + total += rs->GetNumSyncReads(); + } + return total; + } + + uint64_t GetTotalAsyncReads() const { + uint64_t total = 0; + for (const auto& rs : read_sets_) { + total += rs->GetNumAsyncReads(); + } + return total; + } + + uint64_t GetTotalCacheHits() const { + uint64_t total = 0; + for (const auto& rs : read_sets_) { + total += rs->GetNumCacheHits(); + } + return total; + } + + // Get total IO operations (sum of all types) + uint64_t GetTotalIOOperations() const { + return GetTotalSyncReads() + GetTotalAsyncReads() + GetTotalCacheHits(); + } + + // Clear tracked ReadSets + void ClearReadSets() { read_sets_.clear(); } + + private: + std::unique_ptr impl_; + std::vector> read_sets_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/iostats_context.h b/include/rocksdb/iostats_context.h index 592bc0c46709..8fce6181c0b4 100644 --- a/include/rocksdb/iostats_context.h +++ b/include/rocksdb/iostats_context.h @@ -32,22 +32,47 @@ struct FileIOByTemperature { uint64_t hot_file_bytes_read; // the number of bytes read to Temperature::kWarm file uint64_t warm_file_bytes_read; + // the number of bytes read to Temperature::kCool file + uint64_t cool_file_bytes_read; // the number of bytes read to Temperature::kCold file uint64_t cold_file_bytes_read; + // the number of bytes read to Temperature::kIce file + uint64_t ice_file_bytes_read; + // the number of bytes read to Temperature::kUnknown file not in last level + uint64_t unknown_non_last_level_bytes_read; + // the number of bytes read to Temperature::kUnknown file in last level + uint64_t unknown_last_level_bytes_read; // total number of reads to Temperature::kHot file uint64_t hot_file_read_count; // total number of reads to Temperature::kWarm file uint64_t warm_file_read_count; + // total number of reads to Temperature::kCool file + uint64_t cool_file_read_count; // total number of reads to Temperature::kCold file uint64_t cold_file_read_count; + // total number of reads to Temperature::kIce file + uint64_t ice_file_read_count; + // total number of reads to Temperature::kUnknown file not in last level + uint64_t unknown_non_last_level_read_count; + // total number of reads to Temperature::kUnknown file in last level + uint64_t unknown_last_level_read_count; + // reset all the statistics to 0. void Reset() { hot_file_bytes_read = 0; warm_file_bytes_read = 0; + cool_file_bytes_read = 0; cold_file_bytes_read = 0; + ice_file_bytes_read = 0; + unknown_non_last_level_bytes_read = 0; + unknown_last_level_bytes_read = 0; hot_file_read_count = 0; warm_file_read_count = 0; + cool_file_read_count = 0; cold_file_read_count = 0; + ice_file_read_count = 0; + unknown_non_last_level_read_count = 0; + unknown_last_level_read_count = 0; } }; diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h index 51bead99b907..b006138376aa 100644 --- a/include/rocksdb/iterator.h +++ b/include/rocksdb/iterator.h @@ -21,6 +21,7 @@ #include #include "rocksdb/iterator_base.h" +#include "rocksdb/options.h" #include "rocksdb/wide_columns.h" namespace ROCKSDB_NAMESPACE { @@ -93,6 +94,22 @@ class Iterator : public IteratorBase { assert(false); return Slice(); } + + // Prepare the iterator to scan the ranges specified in scan_opts. This + // includes prefetching relevant blocks from disk. The upper bound and + // other table specific limits should be specified for each + // scan for best results. If an upper bound is not specified, Prepare may + // skip prefetching as it cannot accurately determine how much to prefetch. + // + // Prepare should typically be followed by Seeks to the start keys in the + // order they're specified in scan_opts. If the user does a Seek to some + // other target key, the iterator should disregard the scan_opts from that + // point onwards and behave like a normal iterator. Its the user's + // responsibility to again call Prepare(). + // + // If Prepare() is called, it overrides the iterate_upper_bound in + // ReadOptions + virtual void Prepare(const MultiScanArgs& /*scan_opts*/) {} }; // Return an empty iterator (yields nothing). diff --git a/include/rocksdb/ldb_tool.h b/include/rocksdb/ldb_tool.h index 7a4c6ca11fbd..623fb1f0b918 100644 --- a/include/rocksdb/ldb_tool.h +++ b/include/rocksdb/ldb_tool.h @@ -32,10 +32,18 @@ struct LDBOptions { class LDBTool { public: + // DEPRECATED because this function does not return, which can result in + // memory leaks being reported because of the default Options() etc. not being + // destroyed. void Run( int argc, char** argv, Options db_options = Options(), const LDBOptions& ldb_options = LDBOptions(), const std::vector* column_families = nullptr); + + int RunAndReturn( + int argc, char** argv, const Options& db_options = Options(), + const LDBOptions& ldb_options = LDBOptions(), + const std::vector* column_families = nullptr); }; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 019f4d40bf60..1b41ca81f3d9 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -439,6 +439,9 @@ struct CompactionJobInfo { // the job id, which is unique in the same thread. int job_id; + // the number of L0 files in the CF right before and after the compaction + int num_l0_files; + // the smallest input level of the compaction. int base_input_level; // the output level of the compaction. @@ -485,6 +488,9 @@ struct CompactionJobInfo { // Information about blob files deleted during compaction in Integrated // BlobDB. std::vector blob_file_garbage_infos; + + // Whether this compaction was aborted via AbortAllCompactions() + bool aborted = false; }; struct MemTableInfo { diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index fd63f127f468..00d08562762b 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -162,6 +163,12 @@ class MemTableRep { return true; } + // Only used after concurrent memtable inserts. + // This function will be called by each writer after all writes are done + // through InsertConcurrently(). + // This is used by VectorRep to do batched writes for concurrent inserts. + virtual void BatchPostProcess() {} + // Returns true iff an entry that compares equal to key is in the collection. virtual bool Contains(const char* key) const = 0; @@ -195,11 +202,12 @@ class MemTableRep { bool (*callback_func)(void* arg, const char* entry)); // Same as Get() but performs data integrity validation. - virtual Status GetAndValidate(const LookupKey& /* k */, - void* /* callback_args */, - bool (* /* callback_func */)(void* arg, - const char* entry), - bool /*allow_data_in_error*/) { + virtual Status GetAndValidate( + const LookupKey& /* k */, void* /* callback_args */, + bool (* /* callback_func */)(void* arg, const char* entry), + bool /* allow_data_in_error */, bool /* detect_key_out_of_order */, + const std::function& + /* key_validation_callback */) { return Status::NotSupported("GetAndValidate() not implemented."); } @@ -270,9 +278,11 @@ class MemTableRep { // Seek and perform integrity validations on the skip list. // Iterator becomes invalid and Corruption is returned if a // corruption is found. - virtual Status SeekAndValidate(const Slice& /* internal_key */, - const char* /* memtable_key */, - bool /* allow_data_in_errors */) { + virtual Status SeekAndValidate( + const Slice& /* internal_key */, const char* /* memtable_key */, + bool /* allow_data_in_errors */, bool /* detect_key_out_of_order */, + const std::function& + /* key_validation_callback */) { return Status::NotSupported("SeekAndValidate() not implemented."); } @@ -397,6 +407,11 @@ class SkipListFactory : public MemTableRepFactory { // the vector is sorted. This is useful for workloads where iteration is very // rare and writes are generally not issued after reads begin. // +// Concurrent inserts are supported by buffering writes in thread-local vectors +// for each write batch. To optimize performance for concurrent inserts, it is +// recommended to perform batched writes, and enable unordered_write (refer to +// the option comment for its impact on read consistency). +// // Parameters: // count: Passed to the constructor of the underlying std::vector of each // VectorRep. On initialization, the underlying array will be at least count @@ -418,6 +433,8 @@ class VectorRepFactory : public MemTableRepFactory { MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Allocator*, const SliceTransform*, Logger* logger) override; + + bool IsInsertConcurrentlySupported() const override { return true; } }; // This class contains a fixed array of buckets, each diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h index 4ab3842dda80..29e6b6dc575d 100644 --- a/include/rocksdb/metadata.h +++ b/include/rocksdb/metadata.h @@ -224,6 +224,20 @@ struct LevelMetaData { const std::vector files; }; +// Options for filtering column family metadata by key range. +struct GetColumnFamilyMetaDataOptions { + RangeOpt range; + + // The level to filter on. If -1, all levels are included. + int level = -1; + + GetColumnFamilyMetaDataOptions() = default; + + GetColumnFamilyMetaDataOptions(const OptSlice& _start_key, + const OptSlice& _end_key, int _level = -1) + : range(_start_key, _end_key), level(_level) {} +}; + // The metadata that describes a column family. struct ColumnFamilyMetaData { ColumnFamilyMetaData() : size(0), file_count(0), name("") {} @@ -239,6 +253,9 @@ struct ColumnFamilyMetaData { // The name of the column family. std::string name; // The metadata of all levels in this column family. + // levels[i] contains files in level i. + // For level 0, files with recent updates are ordered first. + // For level 1+, files are ordered by increasing key range. std::vector levels; // The total size of all blob files diff --git a/include/rocksdb/multi_scan.h b/include/rocksdb/multi_scan.h new file mode 100644 index 000000000000..4b0917173701 --- /dev/null +++ b/include/rocksdb/multi_scan.h @@ -0,0 +1,248 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/db.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { + +// EXPERIMENTAL +// +// An iterator that returns results from multiple scan ranges. The ranges are +// expected to be in increasing sorted order. +// The results are returned in nested container objects that can be iterated +// using an std::input_iterator. +// +// MultiScan +// | +// --- +// | +// MultiScanIterator <-- std::input_iterator (returns a Scan object for each +// | scan range) +// --- +// | +// Scan +// | +// --- +// | +// ScanIterator <-- std::input_iterator (returns the KVs of a single +// scan range) +// +// The application on top of RocksDB +// would use this as follows - +// +// std::vector scans{{.start = Slice("bar")}, +// {.start = Slice("foo")}}; +// std::unique_ptr iter.reset( +// db->NewMultiScan()); +// try { +// for (auto scan : *iter) { +// for (auto it : scan) { +// // Do something with key - it.first +// // Do something with value - it.second +// } +// } +// } catch (MultiScanException& ex) { +// // Check ex.status() +// } catch (std::logic_error& ex) { +// // Check ex.what() +// } + +class MultiScanException : public std::runtime_error { + public: + explicit MultiScanException(Status& s) + : std::runtime_error(s.ToString()), s_(s) {} + + Status& status() { return s_; } + + private: + Status s_; +}; + +// A container object encapsulating a single scan range. It supports an +// std::input_iterator for a single pass iteration of the KVs in the range. +// A Status exception is thrown if there is an error in scanning the range. +class Scan { + public: + class ScanIterator; + + explicit Scan(Iterator* db_iter) : db_iter_(db_iter) {} + + void Reset(Iterator* db_iter) { db_iter_ = db_iter; } + + ScanIterator begin() { return ScanIterator(db_iter_); } + + std::nullptr_t end() { return nullptr; } + + class ScanIterator { + public: + using self_type = ScanIterator; + using value_type = std::pair; + using reference = std::pair&; + using pointer = std::pair*; + using difference_type = int; + using iterator_category = std::input_iterator_tag; + + explicit ScanIterator(Iterator* db_iter) : db_iter_(db_iter) { + valid_ = db_iter_->Valid(); + if (valid_) { + result_ = value_type(db_iter_->key(), db_iter_->value()); + } + } + + ScanIterator() : db_iter_(nullptr), valid_(false) {} + + ~ScanIterator() { + if (!status_.ok()) { + fprintf(stderr, "ScanIterator status: %s\n", + status_.ToString().c_str()); + assert(false); + } + } + + ScanIterator& operator++() { + if (!valid_) { + throw std::logic_error("Trying to advance invalid iterator"); + } else { + db_iter_->Next(); + status_ = db_iter_->status(); + if (!status_.ok()) { + throw MultiScanException(status_); + } else { + valid_ = db_iter_->Valid(); + if (valid_) { + result_ = value_type(db_iter_->key(), db_iter_->value()); + } + } + } + return *this; + } + + bool operator==(std::nullptr_t /*other*/) const { return !valid_; } + + bool operator!=(std::nullptr_t /*other*/) const { return valid_; } + + reference operator*() { + if (!valid_) { + throw std::logic_error("Trying to deref invalid iterator"); + } + return result_; + } + reference operator->() { + if (!valid_) { + throw std::logic_error("Trying to deref invalid iterator"); + } + return result_; + } + + private: + Iterator* db_iter_; + bool valid_; + Status status_; + value_type result_; + }; + + private: + Iterator* db_iter_; +}; + +// A container object encapsulating the scan ranges for a multi scan. +// It supports an std::input_iterator for a single pass iteration of the +// ScanOptions in scan_opts, which can be dereferenced to get the container +// (Scan) for a single range. +// A Status exception is thrown if there is an error. +class MultiScan { + public: + MultiScan(const ReadOptions& read_options, const MultiScanArgs& scan_opts, + DB* db, ColumnFamilyHandle* cfh); + + explicit MultiScan(const Comparator* comp, + std::unique_ptr&& db_iter) + : scan_opts_(comp), db_iter_(std::move(db_iter)) {} + + class MultiScanIterator { + public: + MultiScanIterator(const MultiScanIterator&) = delete; + MultiScanIterator operator=(MultiScanIterator&) = delete; + + using self_type = MultiScanIterator; + using value_type = Scan; + using reference = Scan&; + using pointer = Scan*; + using difference_type = int; + using iterator_category = std::input_iterator_tag; + + MultiScanIterator(const std::vector& scan_opts, DB* db, + ColumnFamilyHandle* cfh, ReadOptions& read_options, + Slice* upper_bound, std::unique_ptr& db_iter) + : scan_opts_(scan_opts), + db_(db), + cfh_(cfh), + read_options_(read_options), + upper_bound_(upper_bound), + idx_(0), + db_iter_(db_iter), + scan_(db_iter_.get()) { + if (scan_opts_.empty()) { + throw std::logic_error("Zero scans in multi-scan"); + } + status_ = db_iter_->status(); + if (!status_.ok()) { + throw MultiScanException(status_); + } + db_iter_->Seek(*scan_opts_[idx_].range.start); + status_ = db_iter_->status(); + if (!status_.ok()) { + throw MultiScanException(status_); + } + } + + ~MultiScanIterator() { assert(status_.ok()); } + + MultiScanIterator& operator++(); + + bool operator==(std::nullptr_t /*other*/) const { + return idx_ >= scan_opts_.size(); + } + + bool operator!=(std::nullptr_t /*other*/) const { + return idx_ < scan_opts_.size(); + } + + reference operator*() { return scan_; } + reference operator->() { return scan_; } + + private: + const std::vector& scan_opts_; + DB* db_; + ColumnFamilyHandle* cfh_; + ReadOptions& read_options_; + Slice* upper_bound_; + size_t idx_; + std::unique_ptr& db_iter_; + Status status_; + Scan scan_; + }; + + MultiScanIterator begin() { + return MultiScanIterator(scan_opts_.GetScanRanges(), db_, cfh_, + read_options_, &upper_bound_, db_iter_); + } + + std::nullptr_t end() { return nullptr; } + + private: + ReadOptions read_options_; + const MultiScanArgs scan_opts_; + DB* db_; + ColumnFamilyHandle* cfh_; + Slice upper_bound_; + std::unique_ptr db_iter_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 796de1fef086..3c0898fdc82b 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -32,7 +32,7 @@ #include "rocksdb/version.h" #include "rocksdb/write_buffer_manager.h" -#ifdef max +#ifdef max // ODR-SAFE #undef max #endif @@ -57,11 +57,15 @@ class Statistics; class InternalKeyComparator; class WalFilter; class FileSystem; +class UserDefinedIndexFactory; +class IODispatcher; struct Options; struct DbPath; using FileTypeSet = SmallEnumSet; +using CompactionStyleSet = + SmallEnumSet; struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // The function recovers options to a previous version. Only 4.6 or later @@ -231,6 +235,14 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // different options for compression algorithms CompressionOptions compression_opts; + // EXPERIMENTAL + // Customized compression through a callback interface. When non-nullptr, + // supersedes the above compression options, except that the above options are + // still processed as they historically would be and passed to + // CompressionManager::GetCompressorForSST as hints or suggestions. See + // advanced_compression.h + std::shared_ptr compression_manager; + // Number of files to trigger level-0 compaction. A value <0 means that // level-0 compaction will not be triggered by number of files at all. // @@ -293,9 +305,6 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API uint64_t max_bytes_for_level_base = 256 * 1048576; - // Deprecated. - uint64_t snap_refresh_nanos = 0; - // Disable automatic compactions. Manual compactions can still // be issued on this column family // @@ -454,6 +463,7 @@ extern const char* kHostnameForDbHostId; enum class CompactionServiceJobStatus : char { kSuccess, kFailure, + kAborted, kUseLocal, }; @@ -461,6 +471,12 @@ struct CompactionServiceJobInfo { std::string db_name; std::string db_id; std::string db_session_id; + + // the id of the column family where the compaction happened. + uint32_t cf_id; + // the name of the column family where the compaction happened. + std::string cf_name; + uint64_t job_id; // job_id is only unique within the current DB and session, // restart DB will reset the job_id. `db_id` and // `db_session_id` could help you build unique id across @@ -474,21 +490,34 @@ struct CompactionServiceJobInfo { bool is_manual_compaction; bool bottommost_level; + // the smallest input level of the compaction. + // (same as Compaction::start_level and CompactionJobInfo::base_input_level) + int base_input_level; + // the output level of the compaction. + int output_level; + + CompactionServiceJobInfo() {} CompactionServiceJobInfo(std::string db_name_, std::string db_id_, - std::string db_session_id_, uint64_t job_id_, + std::string db_session_id_, uint32_t cf_id_, + std::string cf_name_, uint64_t job_id_, Env::Priority priority_, CompactionReason compaction_reason_, bool is_full_compaction_, bool is_manual_compaction_, - bool bottommost_level_) + bool bottommost_level_, int base_input_level_, + int output_level_) : db_name(std::move(db_name_)), db_id(std::move(db_id_)), db_session_id(std::move(db_session_id_)), + cf_id(cf_id_), + cf_name(std::move(cf_name_)), job_id(job_id_), priority(priority_), compaction_reason(compaction_reason_), is_full_compaction(is_full_compaction_), is_manual_compaction(is_manual_compaction_), - bottommost_level(bottommost_level_) {} + bottommost_level(bottommost_level_), + base_input_level(base_input_level_), + output_level(output_level_) {} }; struct CompactionServiceScheduleResponse { @@ -579,12 +608,20 @@ struct DBOptions { // checksums. True also enters a read-only mode when a DB write fails; // see DB::Resume(). // + // When set to true, the DB will fail to open if any SST files fail to open + // e.g. due to incorrect file size or corrupted footer. + // + // When set to false, when there are files corrupted, the DB will still be + // opened, and the healthy ones could still be accessed, while corrupted one + // will not + // // As most workloads value data correctness over availability, this option // is on by default. Note that the name of this old option is potentially // misleading, and other options and operations go further in proactive // checking for corruption, including // * paranoid_file_checks // * paranoid_memory_checks + // * memtable_veirfy_per_key_checksum_on_seek // * DB::VerifyChecksum() // // Default: true @@ -593,7 +630,8 @@ struct DBOptions { // DEPRECATED: This option might be removed in a future release. // // If true, during memtable flush, RocksDB will validate total entries - // read in flush, and compare with counter inserted into it. + // read in flush, total entries written in the SST and compare them with + // counter of keys added. // // The option is here to turn the feature off in case this new validation // feature has a bug. The option may be removed in the future once the @@ -812,6 +850,7 @@ struct DBOptions { // If it is non empty, the log files will be in the specified dir, // and the db data dir's absolute path will be used as the log file // name's prefix. + // NOTE: not for WALs std::string db_log_dir = ""; // This specifies the absolute dir path for write-ahead logs (WAL). @@ -892,21 +931,24 @@ struct DBOptions { // be created. // If max_log_file_size == 0, all logs will be written to one // log file. + // NOTE: not for WALs size_t max_log_file_size = 0; // Time for the info log file to roll (in seconds). // If specified with non-zero value, log file will be rolled // if it has been active longer than `log_file_time_to_roll`. // Default: 0 (disabled) + // NOTE: not for WALs size_t log_file_time_to_roll = 0; // Maximal info log files to be kept. // Default: 1000 + // NOTE: not for WALs size_t keep_log_file_num = 1000; - // Recycle log files. - // If non-zero, we will reuse previously written log files for new - // logs, overwriting the old data. The value indicates how many + // Recycle WAL files. + // If non-zero, we will reuse previously written WAL files for new + // WALs, overwriting the old data. The value indicates how many // such files we will keep around at any point in time for later // use. This is more efficient because the blocks are already // allocated and fdatasync does not need to update the inode after @@ -914,12 +956,67 @@ struct DBOptions { // Default: 0 size_t recycle_log_file_num = 0; - // manifest file is rolled over on reaching this limit. - // The older manifest file be deleted. - // The default value is 1GB so that the manifest file can grow, but not - // reach the limit of storage capacity. + // The manifest file is rolled over on reaching this limit AND the + // space amp limit described in max_manifest_space_amp_pct. More trade-off + // details there. + // + // NOTE: this option used to be a hard limit, but that made this a dangerous + // tuning parameter for optimizing manifest file size because the best + // size really depends on the DB size and average SST file size (and other + // settings). Now it is essentially a minimum for the auto-tuned max manifest + // file size. + // + // Until the max_manifest_space_amp_pct feature is fully validated to show a + // smaller default here like 1MB is appropriate, the default value is 1GB to + // match historical behavior (without it being a hard limit in case of giant + // compacted manifest size). + // + // This option is mutable with SetDBOptions(), taking effect on the next + // manifest write (e.g. completed DB compaction or flush). uint64_t max_manifest_file_size = 1024 * 1024 * 1024; + // This option mostly replaces max_manifest_file_size to control an auto-tuned + // balance of manifest write amplification and space amplification. A new + // manifest file is created with the "compacted" contents of the old one when + // current_manifest_size + // > + // max(max_manifest_file_size, + // est_compacted_manifest_size * (1 + max_manifest_space_amp_pct/100)) + // + // where est_compacted_manifest_size is an estimate of how big a new compacted + // version of the current manifest would be. Currently, the estimate used is + // the last newly-written manifest, in its "compacted" form. + // + // Space amplification in the manifest file might be less of a concern for + // primary storage space and more of a concern for DB recover time and size of + // backup files that aren't incremental between backups. To minimize manifest + // churn on initial DB population, setting max_manifest_file_size to something + // not too small, like 1MB, should suffice. Similarly, write amp on the + // manifest file is likely not a direct concern but completed compactions and + // flushes cannot (currently) be committed while the (relatively small) + // manifest file is being compacted. Manifest compactions should not + // interfere with user write latency or throughput unless the DB is + // chronically stalling or close to stalling writes already. + // + // For this option to have a meaningful effect, it is recommended to set + // max_manifest_file_size to something modest like 1MB. Then we can interpret + // values for this option as follows, starting with minimum space amp and + // maximum write amp: + // * 0 - Every manifest write (flush, compaction, etc.) generates a whole new + // manifest. Only useful for testing. + // * very small - Doesn't take many manifest writes to generate a whole new + // manifest. + // * 100 - In a DB with pretty consistent number of SST files, etc., achieves + // about 1.0 write amp (writing about 2x the theoretical minimum) and a max of + // about 1.0 space amp (manifest up to 2x the compacted size). + // * 500 - Recommended and default: 0.2 write amp and up to roughly 5.0 space + // amp. + // * 10000 - 0.01 write amp and up to 100 space amp on the manifest. + // + // This option is mutable with SetDBOptions(), taking effect on the next + // manifest write (e.g. completed DB compaction or flush). + int max_manifest_space_amp_pct = 500; + // Number of shards used for table cache. int table_cache_numshardbits = 6; @@ -1263,16 +1360,6 @@ struct DBOptions { // Default: false bool skip_stats_update_on_db_open = false; - // If true, then DB::Open() will not fetch and check sizes of all sst files. - // This may significantly speed up startup if there are many sst files, - // especially when using non-default Env with expensive GetFileSize(). - // We'll still check that all required sst files exist. - // If paranoid_checks is false, this option is ignored, and sst files are - // not checked at all. - // - // Default: false - bool skip_checking_sst_file_sizes_on_db_open = false; - // Recovery mode to control the consistency while replaying WAL // Default: kPointInTimeRecovery WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; @@ -1294,14 +1381,6 @@ struct DBOptions { // currently. WalFilter* wal_filter = nullptr; - // DEPRECATED: This option might be removed in a future release. - // - // If true, then DB::Open, CreateColumnFamily, DropColumnFamily, and - // SetOptions will fail if options file is not properly persisted. - // - // DEFAULT: true - bool fail_if_options_file_error = true; - // If true, then print malloc stats together with rocksdb.stats // when printing to LOG. // DEFAULT: false @@ -1325,16 +1404,11 @@ struct DBOptions { // Dynamically changeable through SetDBOptions() API. bool avoid_flush_during_shutdown = false; - // Set this option to true during creation of database if you want - // to be able to ingest behind (call IngestExternalFile() skipping keys - // that already exist, rather than overwriting matching keys). - // Setting this option to true has the following effects: - // 1) Disable some internal optimizations around SST file compression. - // 2) Reserve the last level for ingested files only. - // 3) Compaction will not include any file from the last level. - // Note that only Universal Compaction supports allow_ingest_behind. - // `num_levels` should be >= 3 if this option is turned on. + // DEPRECATED: use ColumnFamilyOptions::cf_allow_ingest_behind instead. + // This option might be removed in a future release. // + // See comment for `ColumnFamilyOptions::cf_allow_ingest_behind` for + // detail about the option's functionality and use cases. // // DEFAULT: false // Immutable. @@ -1420,9 +1494,10 @@ struct DBOptions { // prefix_same_as_start=true can take advantage of prefix seek optimizations. bool prefix_seek_opt_in_only = false; - // The number of bytes to prefetch when reading the log. This is mostly useful - // for reading a remotely located log, as it can save the number of - // round-trips. If 0, then the prefetching is disabled. + // The number of bytes to prefetch when reading the DB manifest and WAL files + // during DB::Open (and variants). This is mostly useful for reading a + // remotely located log, as it can save the number of round-trips. If 0, then + // the prefetching is disabled. // // Default: 0 size_t log_readahead_size = 0; @@ -1619,6 +1694,24 @@ struct DBOptions { // `kUnknown`, this overrides any temperature set by OptimizeForLogWrite // functions. Temperature wal_write_temperature = Temperature::kUnknown; + + // Enum set indicative of which compaction styles SST write lifetime hint + // calculation is allowed on. Today, RocksDB provides native support for + // kCompactionStyleLevel and kCompactionStyleUniversal (experimental version). + // Other compaction styles, even when enabled in the set, won't have any + // effect in the default PosixWritableFile file implementation. There are + // numerous benefits coming from employing the hints including reduction in + // write amplification caused by OS file movement during garbage collection, + // and reduction in wear-leveling (SSDs). However, as currently implemented, + // SST write lifetime hints are calculated in a static way and solely based on + // the level, which might not be suitable for non-uniform workloads with + // dynamic / high-variance lifespan of data within the same level. In those + // cases (or when the performance is not satisfactory), it's recommended to + // disable the hints by assigning the setting to the empty set (= {}); + // + // Default: Enabled in kCompactionStyleLevel mode. + CompactionStyleSet calculate_sst_write_lifetime_hint_set = { + CompactionStyle::kCompactionStyleLevel}; // End EXPERIMENTAL }; @@ -1682,6 +1775,174 @@ enum ReadTier { kMemtableTier = 0x3 // data in memtable. used for memtable-only iterators. }; +// A range of keys. In case of user_defined timestamp, if enabled, `start` and +// `limit` should point to key without timestamp part. +struct Range { + Slice start; + Slice limit; + + Range() {} + Range(const Slice& s, const Slice& l) : start(s), limit(l) {} +}; + +// A key range with optional endpoints. In case of user_defined timestamp, if +// enabled, `start` and `limit` should point to key without timestamp part. +struct RangeOpt { + // When start.has_value() == false, refers to starting before every key + OptSlice start; + // When limit.has_value() == false, refers to ending after every key + OptSlice limit; + + RangeOpt() {} + RangeOpt(const OptSlice& s, const OptSlice& l) : start(s), limit(l) {} +}; + +// EXPERIMENTAL +// +// Options for a RocksDB scan request. Only forward scans for now. +// We may add other options such as prefix scan in the future. +struct ScanOptions { + // The scan range. Mandatory for start to be set, limit is optional + RangeOpt range; + + // A map of name,value pairs that can be passed by the user to an + // external table reader. This is completely opaque to RocksDB and is + // ignored by the natively supported table readers like block based and plain + // table. This is only useful for Iterator. + std::optional> property_bag; + + // An unbounded scan with a start key + explicit ScanOptions(const Slice& _start) : range(_start, OptSlice()) {} + + // A bounded scan with a start key and upper bound + ScanOptions(const Slice& _start, const Slice& _upper_bound) + : range(_start, _upper_bound) {} +}; + +// Container for multiple scan ranges that can be used with MultiScan. +// This replaces std::vector with a more efficient implementation +// that can merge overlapping ranges. +class MultiScanArgs { + public: + // Constructor that takes a comparator + explicit MultiScanArgs(const Comparator* comparator) : comp_(comparator) {} + + // Copy Constructor + MultiScanArgs(const MultiScanArgs& other) { + comp_ = other.comp_; + original_ranges_ = other.original_ranges_; + io_coalesce_threshold = other.io_coalesce_threshold; + max_prefetch_size = other.max_prefetch_size; + use_async_io = other.use_async_io; + io_dispatcher = other.io_dispatcher; + } + MultiScanArgs(MultiScanArgs&& other) noexcept + : io_coalesce_threshold(other.io_coalesce_threshold), + max_prefetch_size(other.max_prefetch_size), + use_async_io(other.use_async_io), + io_dispatcher(std::move(other.io_dispatcher)), + comp_(other.comp_), + original_ranges_(std::move(other.original_ranges_)) {} + + MultiScanArgs& operator=(const MultiScanArgs& other) { + comp_ = other.comp_; + original_ranges_ = other.original_ranges_; + io_coalesce_threshold = other.io_coalesce_threshold; + max_prefetch_size = other.max_prefetch_size; + use_async_io = other.use_async_io; + io_dispatcher = other.io_dispatcher; + return *this; + } + + MultiScanArgs& operator=(MultiScanArgs&& other) noexcept { + if (this != &other) { + comp_ = other.comp_; + original_ranges_ = std::move(other.original_ranges_); + io_coalesce_threshold = other.io_coalesce_threshold; + max_prefetch_size = other.max_prefetch_size; + use_async_io = other.use_async_io; + io_dispatcher = std::move(other.io_dispatcher); + } + return *this; + } + + void insert(const Slice& s, const Slice& b) { + original_ranges_.emplace_back(s, b); + } + + void insert(const Slice& s, const Slice& b, + const std::optional>& + property_bag) { + original_ranges_.emplace_back(s, b); + original_ranges_.back().property_bag = property_bag; + } + + void insert(const Slice& s) { original_ranges_.emplace_back(s); } + + void insert(const Slice& s, + const std::optional>& + property_bag) { + original_ranges_.emplace_back(s); + original_ranges_.back().property_bag = property_bag; + } + + size_t size() const { return original_ranges_.size(); } + bool empty() const { return original_ranges_.empty(); } + + void reserve(size_t size) { original_ranges_.reserve(size); } + + operator std::vector*() { return &original_ranges_; } + + operator const std::vector*() const { return &original_ranges_; } + + ~MultiScanArgs() {} + + const std::vector& GetScanRanges() const { + return original_ranges_; + } + + const Comparator* GetComparator() const { return comp_; } + + // Copies the configurations (excluding actual scan ranges) from another + // MultiScanArgs. + void CopyConfigFrom(const MultiScanArgs& other) { + io_coalesce_threshold = other.io_coalesce_threshold; + max_prefetch_size = other.max_prefetch_size; + use_async_io = other.use_async_io; + io_dispatcher = other.io_dispatcher; + } + + uint64_t io_coalesce_threshold = 16 << 10; // 16KB by default + + // Maximum size (in bytes) for the data blocks loaded by a MultiScan. + // This limits the amount of I/O and memory usage by pinned data blocks. + // + // When set to 0 (the default), there is no limit. When the limit is reached, + // the iterator will start returning Status::PrefetchLimitReached(). + // + // Note that prefetching happens only once in Prepare(), which is different + // from ReadOptions::readahead_size, which applies any time the iterator does + // I/O. + // Note that this limit is per file and applies to compressed block size. + uint64_t max_prefetch_size = 0; + + // Enable async I/O for multi-scan operations + // When true, BlockBasedTableIterator will use ReadAsync() for reading blocks + // When false, it will use synchronous MultiRead(). + bool use_async_io = false; + + // Optional IODispatcher for multi-scan operations. + // If nullptr (default), a new IODispatcher is created internally. + // Users can provide their own IODispatcher for custom IO scheduling + // or for testing/monitoring purposes (e.g., to check IO statistics). + std::shared_ptr io_dispatcher = nullptr; + + private: + // The comparator used for ordering ranges + const Comparator* comp_; + std::vector original_ranges_; +}; + // Options that control read operations struct ReadOptions { // *** BEGIN options relevant to point lookups as well as scans *** @@ -1763,6 +2024,10 @@ struct ReadOptions { // block cache. bool fill_cache = true; + // DEPRECATED: This option might be removed in a future release. + // There should be no noticeable performance difference whether this option + // is turned on or off when a DB does not use DeleteRange(). + // // If true, range tombstones handling will be skipped in key lookup paths. // For DB instances that don't use DeleteRange() calls, this setting can // be used to optimize the read performance. @@ -1841,10 +2106,6 @@ struct ReadOptions { // that were inserted into the database after the creation of the iterator. bool tailing = false; - // This options is not used anymore. It was to turn on a functionality that - // has been removed. DEPRECATED - bool managed = false; - // Enable a total order seek regardless of index format (e.g. hash index) // used in the table. Some table format (e.g. plain table) may not support // this option. @@ -1968,6 +2229,17 @@ struct ReadOptions { // Default: false bool auto_refresh_iterator_with_snapshot = false; + // EXPERIMENTAL + // + // Specify an alternate index to use in the SST files instead of the native + // block based table index. The table_factory used for the column family + // must support building/reading this index. + // + // Currently, only forward scans are supported. For forward scans, only Seek() + // is supported. SeekToFirst() is not supported. If the caller wishes to scan + // from start to end, the native index must be used. + const UserDefinedIndexFactory* table_index_factory = nullptr; + // *** END options only relevant to iterators or scans *** // *** BEGIN options for RocksDB internal use only *** @@ -1975,18 +2247,21 @@ struct ReadOptions { // EXPERIMENTAL Env::IOActivity io_activity = Env::IOActivity::kUnknown; - // EXPERIMENTAL - // An optional weight of values to be returned by a scan. Once the - // weight is reached or exceeded the scan is terminated (i.e Next() - // invalidates the iterator). In the case of a DB with one of the built-in - // table formats, such as BlockBasedTable, the weight is simply the number - // of key-value pairs. In the case of an ExternalTableReader, the weight is - // passed through to the table reader and the interpretation is upto the - // reader implementation. - uint64_t weight = 0; - // *** END options for RocksDB internal use only *** + // *** BEGIN per-request settings for internal team use only *** + + // TODO: create a new struct for per-request options, potentially including + // timestamps in point lookups/scans + + // request_id is a unique id assigned by the application. It is used to allow + // us to link file system metrics/logs to rocksDB and application logs. This + // request_id may not be unique to each RocksDB api call - it could refer to + // an application level request that results in multiple RocksDB api calls + const std::string* request_id = nullptr; + + // *** END per-request settings for internal team use only *** + ReadOptions() {} ReadOptions(bool _verify_checksums, bool _fill_cache); explicit ReadOptions(Env::IOActivity _io_activity); @@ -2098,6 +2373,23 @@ struct FlushOptions { FlushOptions() : wait(true), allow_write_stall(false) {} }; +struct FlushWALOptions { + // If true, it calls `SyncWAL()` afterwards. + // Default: false + bool sync; + + // For IO operations associated with flushing the WAL, charge the internal + // rate limiter (see `DBOptions::rate_limiter`) at the specified priority and + // pass the priority down to the file system through + // `IOOptions::rate_limiter_priority`. The special value `Env::IO_TOTAL` + // disables charging the rate limiter. + // + // Default: `Env::IO_TOTAL` + Env::IOPriority rate_limiter_priority; + + FlushWALOptions() : sync(false), rate_limiter_priority(Env::IO_TOTAL) {} +}; + // Create a Logger from provided DBOptions Status CreateLoggerFromOptions(const std::string& dbname, const DBOptions& options, @@ -2126,10 +2418,31 @@ struct CompactionOptions { // If > 0, it will replace the option in the DBOptions for this compaction. uint32_t max_subcompactions; + // Allows cancellation of an in-progress manual compaction. + // + // Cancellation can be delayed waiting on automatic compactions when used + // together with `exclusive_manual_compaction == true`. + std::atomic* canceled; + // NOTE: Calling DisableManualCompaction() will not override the + // canceled variable in CompactionOptions, as it does for CompactRangeOptions + // - this is because ManualCompactionState is not used + + // Create output compaction file using this file temperature. If unset, will + // default to "last_level_temperature" if output level is last level otherwise + // "default_write_temperature" + Temperature output_temperature_override = Temperature::kUnknown; + + // Option to optimize the manual compaction by enabling trivial move for non + // overlapping files. + // Default: false + bool allow_trivial_move; + CompactionOptions() : compression(kDisableCompressionOption), output_file_size_limit(std::numeric_limits::max()), - max_subcompactions(0) {} + max_subcompactions(0), + canceled(nullptr), + allow_trivial_move(false) {} }; // For level based compaction, we can configure if we want to skip/force @@ -2196,7 +2509,7 @@ struct CompactRangeOptions { // Cancellation can be delayed waiting on automatic compactions when used // together with `exclusive_manual_compaction == true`. std::atomic* canceled = nullptr; - // NOTE: Calling DisableManualCompaction() overwrites the uer-provided + // NOTE: Calling DisableManualCompaction() overwrites the user-provided // canceled variable in CompactRangeOptions. // Typically, when CompactRange is being called in one thread (t1) with // canceled = false, and DisableManualCompaction is being called in the @@ -2218,7 +2531,47 @@ struct CompactRangeOptions { double blob_garbage_collection_age_cutoff = -1; }; -// IngestExternalFileOptions is used by IngestExternalFile() +// IngestExternalFileOptions setting guide: +// +// The options in IngestExternalFileOptions interact in complex ways depending +// on the source and overlap of SST files. Below is a summary of recommended +// non-default settings for common use cases: +// +// 1. Ingesting only SST writer generated non-overlapping SSTs that are not +// expected to overlap with existing data: +// - Optionally set fail_if_not_bottommost_level = true to enforce placement +// in the last level. This is better paird with SST partitioner to guarantee +// that there are no existing file with keys across the ingesting key range. +// - Set allow_blocking_flush to false: Not expecting to overlap with +// memtable and cause a flush. +// - If snapshot consistency is not expected, set snapshot_consistency to +// false and allow_global_seqno to false. allow_global_seqno = false will +// fail ingestion if any input file overlap with each other. +// +// 2. Ingesting SST writer generated overlapping SSTs: +// - order files with older updates first, newer overwrites later. +// - Set allow_global_seqno = true since newer files need to be assigned +// larger sequence numbers. +// +// 3. Ingesting DB generated SSTs: overlapping with target CF data is not +// allowed. Input files are allowed to contain both DB generated files and SST +// file writer generated files. They will all be treated as DB generated. +// - Set allow_db_generated_files = true. +// - Set snapshot_consistency = false: snapshot consistency requires +// assigning higher sequence number to ingested files. DB generated files +// don't support global seqno assignment yet. +// - Set allow_blocking_flush to false: Not expecting to overlap with +// memtable and cause a flush. +// - If the source live DB is running, set link_files = true instead of +// move_files. +// 3a) SST files are non-overlapping and all keys have seqno 0: e.g., a +// temporary RocksDB instance used to sort some data, and compacts all +// data into the last level before ingestion. +// - Optionally set fail_if_not_bottommost_level = true to enforce placement +// in the last level. +// 3b) SST files are overlapping, e.g. ingesting files from one CF to another. +// - Ensure older updates are ordered first and newer updates are ordered +// later. See more in option comment for allow_db_generated_files. struct IngestExternalFileOptions { // Can be set to true to move the files instead of copying them. // The input files will be unlinked after successful ingestion. @@ -2235,20 +2588,33 @@ struct IngestExternalFileOptions { // If set to false, an ingested file keys could appear in existing snapshots // that where created before the file was ingested. bool snapshot_consistency = true; - // If set to false, IngestExternalFile() will fail if the file key range + // Enables assiging a global sequence number to each ingested file, i.e., + // all keys in the ingested file will be treated as having this seqno. + // If set to false, we will use the sequence numbers in the ingested file + // as is, and IngestExternalFile() will fail if the ingested key range // overlaps with existing keys or tombstones or output of ongoing compaction - // during file ingestion in the DB (the conditions under which a global_seqno - // must be assigned to the ingested file). + // in the CF (the conditions under which a global seqno must be assigned to + // the ingested file). + // If the ingested files overlap with each other, we need to assign global + // sequence to the ingested files and this option needs to be enabled. One + // exception to this is when ingesting DB generated SST files (see option + // allow_db_generated_files below). DB generated files do not support + // global seqno assignment and can be ingested even if they overlap with + // each other. This option has no effect when allow_db_generated_files is + // enabled. bool allow_global_seqno = true; - // If set to false and the file key range overlaps with the memtable key range - // (memtable flush required), IngestExternalFile will fail. + // Normally (true), IngestExternalFile() will trigger and block for flushing + // memtable(s) if there is overlap between ingested files and memtable(s). If + // allow_blocking_flush is set to false, IngestExternalFile() will fail if the + // file key range overlaps with the memtable key range (memtable flush + // required). bool allow_blocking_flush = true; // Set to true if you would like duplicate keys in the file being ingested // to be skipped rather than overwriting existing data under that key. // Use case: back-fill of some historical data in the database without // over-writing existing newer version of data. - // This option could only be used if the DB has been running - // with allow_ingest_behind=true since the dawn of time. + // This option could only be used if the CF has been running + // with cf_allow_ingest_behind=true since CF creation (or before any write). // All files will be ingested at the bottommost level with seqno=0. bool ingest_behind = false; // DEPRECATED - Set to true if you would like to write global_seqno to @@ -2301,18 +2667,53 @@ struct IngestExternalFileOptions { // // XXX: "bottommost" is obsolete/confusing terminology to refer to last level bool fail_if_not_bottommost_level = false; - // EXPERIMENTAL - // Enables ingestion of files not generated by SstFileWriter. When true: + // EXPERIMENTAL, SUBJECT TO CHANGE + // + // Enables special mode of ingestion that allows files generated by a live DB, + // instead of SstFileWriter. When true: // - Allows files to be ingested when their cf_id doesn't match the CF they // are being ingested into. + // - Allows files with any sequence numbers to be ingested. + // - Original sequence numbers are preserved (no reassignment). + // // REQUIREMENTS: - // - Ingested files must not overlap with existing keys. - // - `write_global_seqno` must be false. - // - All keys in ingested files should have sequence number 0. We fail - // ingestion if any sequence numbers is non-zero. - // WARNING: If a DB contains ingested files generated by another DB/CF, - // RepairDB() may not recover these files correctly, potentially leading to - // data loss. + // - Ingested files must NOT overlap with any existing data in the DB. Since + // no sequence number reassignment is performed on db generated files. + // Ingestion will fail if any overlap is detected. However, input files + // are allowed to overlap with each other when this option is enabled. This + // is useful when ingesting multiple levels of files from a CF, where + // levels naturally overlap with each other. + // - CAUTION: If input files overlap with each other, then for any given user + // key appearing in multiple files, earlier files MUST have smaller sequence + // numbers than later files. Later files will be placed at a higher level + // (smaller level number). This is to ensure the LSM invariant where for + // the same key, recent updates are in higher levels. This means that + // if you are ingesting files from multiple levels of a CF, you should + // put files from lower levels first, and files from higher levels later. + // Example for getting files from a CF for ingestion: + // + // ColumnFamilyMetaData cf_meta; + // from_db->GetColumnFamilyMetaData(from_cf, &cf_meta); + // // iterate in reverse to start from lowest level + // for (auto level_meta = cf_meta.levels.rbegin(); + // level_meta != cf_meta.levels.rend(); ++level_meta) { + // // L0 files need to be added in reverse order so we iterate in reverse + // // within a level too + // for (auto file_meta = level_meta->files.rbegin(); + // file_meta != level_meta->files.rend(); ++file_meta) { + // // Add file for ingestion + // } + // } + // + // WARNING: Violating the sequence number ordering requirement will cause + // LSM invariant violations and may lead to incorrect reads or data + // corruption. + // - If you would like to enforce that the ingested files do not overlap + // with each other, you can set `fail_if_not_bottommost_level` to true. + // If ingested files overlap with each other, some file will be placed + // above Lmax, failing the ingestion if the option is set. + // - `write_global_seqno` must be false (sequence numbers cannot be + // reassigned). bool allow_db_generated_files = false; // Controls whether data and metadata blocks (e.g. index, filter) read during @@ -2324,6 +2725,44 @@ struct IngestExternalFileOptions { bool fill_cache = true; }; +// It is valid that files_checksums and files_checksum_func_names are both +// empty (no checksum information is provided for ingestion). Otherwise, +// their sizes should be the same as external_files. The file order should +// be the same in three vectors and guaranteed by the caller. +// Note that, we assume the temperatures of this batch of files to be +// ingested are the same. +struct IngestExternalFileArg { + ColumnFamilyHandle* column_family = nullptr; + std::vector external_files; + IngestExternalFileOptions options; + std::vector files_checksums; + std::vector files_checksum_func_names; + // A hint as to the temperature for *reading* the files to be ingested. + Temperature file_temperature = Temperature::kUnknown; + // EXPERIMENTAL: When specified, existing keys in the given range will be + // cleared atomically as part of the ingestion, where the ingested files are + // logically applied on top of the cleared key range. + // * If both `start` and `limit` are nullptr, the entire column family is + // cleared; however, setting just one bound to nullptr is not yet supported. + // * When a range is specified, all the external files in this batch must + // be contained in that key range. + // * Checks for memtable overlap and possible blocking flush will apply + // to this range (not just the file ranges). + // * Not compatible with ingest_behind=true. + // * When options.snapshot_consistency = false, the range is cleared + // similarly to DeleteFilesInRange, but fails if any files overlap the range + // only partially. + // * It is recommended to use fail_if_not_bottommost_level=true to ensure + // data in the key range is ingested to a single compacted level (the + // last level). (fail_if_not_bottommost_level=false allows overlap between + // the ingested files.) + // * options.snapshot_consistency = true is not yet supported. + // BUG: the upper bound of the range may be interpreted as inclusive or + // exclusive, so it is best not to depend on one or the other until it is + // sorted out. + std::optional atomic_replace_range; +}; + enum TraceFilterType : uint64_t { // Trace all the operations kTraceFilterNone = 0x0, @@ -2409,15 +2848,58 @@ struct CompactionServiceOptionsOverride { // to set it here. std::shared_ptr statistics = nullptr; + // Info Log. If not overriden, default one will be used. + std::shared_ptr info_log = nullptr; + // Only compaction generated SST files use this user defined table properties // collector. std::vector> table_properties_collector_factories; + + // All other options to override. Unknown options will be ignored. + std::unordered_map options_map; }; struct OpenAndCompactOptions { // Allows cancellation of an in-progress compaction. std::atomic* canceled = nullptr; + + // EXPERIMENTAL + // + // Controls whether OpenAndCompact() should attempt to resume from previously + // persisted compaction progress or start fresh. + // + // When `allow_resumption = true`: + // - OpenAndCompact() attempts to resume from previously persisted compaction + // progress stored in `output_directory` + // - During execution, it periodically persists new progress to the same + // directory, allowing future calls to continue from where the previous + // compaction left off. + // - Fallback behavior: If resumption cannot be fulfilled (e.g., due to + // corrupted or missing resume state), the system will attempt to start a + // fresh compaction as a best-effort fallback by cleaning related files in + // the `output_directory` to achieve a clean state. If even the fresh + // compaction cannot be started, a non-OK status will be returned. + // - Important: Resume attempts will be ineffective if the underlying + // conditions that caused the previous OpenAndCompact() failure still + // persist. The same non-OK status will likely be returned unless the root + // cause has been resolved. + // - Progress persistence is sequential and best-effort, triggered upon + // completion of each new output file. If compaction is interrupted while + // creating an output file (before its completion), that partial work will + // need to be redone upon resumption. + // + // When `allow_resumption = false`: + // - OpenAndCompact() starts a fresh compaction from scratch. + // - No progress will be saved during execution, so interruptions require + // starting over completely. + // - CRITICAL REQUIREMENT: The `output_directory` associated MUST be empty + // before calling OpenAndCompact(). Any existing files (including resume + // state or output files from previous runs) may cause correctness errors. + // + // Limitation: Currently incompatible with paranoid_file_checks=true. The + // option is effectively disabled when `paranoid_file_checks` is enabled. + bool allow_resumption = false; }; struct LiveFilesStorageInfoOptions { diff --git a/include/rocksdb/point_lock_bench_tool.h b/include/rocksdb/point_lock_bench_tool.h new file mode 100644 index 000000000000..ed6066c43128 --- /dev/null +++ b/include/rocksdb/point_lock_bench_tool.h @@ -0,0 +1,14 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +int point_lock_bench_tool(int argc, char** argv); +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/rate_limiter.h b/include/rocksdb/rate_limiter.h index 51383ba20adc..ede742aba6ac 100644 --- a/include/rocksdb/rate_limiter.h +++ b/include/rocksdb/rate_limiter.h @@ -132,7 +132,7 @@ class RateLimiter { } protected: - Mode GetMode() { return mode_; } + Mode GetMode() const { return mode_; } private: const Mode mode_; diff --git a/include/rocksdb/secondary_cache.h b/include/rocksdb/secondary_cache.h index e8644c45469f..c7b7b6886efb 100644 --- a/include/rocksdb/secondary_cache.h +++ b/include/rocksdb/secondary_cache.h @@ -33,8 +33,8 @@ namespace ROCKSDB_NAMESPACE { // Wait() or SecondaryCache::WaitAll() may be skipped if IsReady() happens to // return true, but (depending on the implementation) IsReady() might never // return true without Wait() or SecondaryCache::WaitAll(). After the handle -// is known ready, calling Value() is required to avoid a memory leak in case -// of a cache hit. +// is known ready, calling Value() and taking ownership is required to avoid +// a memory leak in case of a cache hit. class SecondaryCacheResultHandle { public: virtual ~SecondaryCacheResultHandle() = default; diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h index 0d7eb59499eb..dde34d709d65 100644 --- a/include/rocksdb/slice.h +++ b/include/rocksdb/slice.h @@ -20,10 +20,11 @@ #include #include +#include #include #include #include -#include // RocksDB now requires C++17 support +#include #include "rocksdb/cleanable.h" @@ -129,6 +130,46 @@ class Slice { // Intentionally copyable }; +// A likely more efficient alternative to std::optional. For example, +// an empty key might be distinct from "not specified" (and Slice* as an +// optional is more troublesome to deal with). +class OptSlice { + public: + OptSlice() : slice_(nullptr, SIZE_MAX) {} + /*implicit*/ OptSlice(const Slice& s) : slice_(s) {} + /*implicit*/ OptSlice(const std::string& s) : slice_(s) {} + /*implicit*/ OptSlice(const std::string_view& sv) : slice_(sv) {} + /*implicit*/ OptSlice(const char* c_str) : slice_(c_str) {} + // For easier migrating from APIs uing Slice* as an optional type. + // CAUTION: OptSlice{nullptr} is "no value" while Slice{nullptr} is "empty" + /*implicit*/ OptSlice(std::nullptr_t) : OptSlice() {} + + bool has_value() const noexcept { return slice_.size() != SIZE_MAX; } + explicit operator bool() const noexcept { return has_value(); } + + const Slice& value() const noexcept { + assert(has_value()); + return slice_; + } + const Slice& operator*() const noexcept { return value(); } + const Slice* operator->() const noexcept { return &value(); } + + const Slice* AsPtr() const noexcept { + return has_value() ? &slice_ : nullptr; + } + // Populate from an optional pointer. This is a very explicit conversion + // to minimize risk of bugs as in + // Slice start, limit; + // RangeOpt rng = {&start, &limit}; + // start = ...; // BUG: would not affect rng + static OptSlice CopyFromPtr(const Slice* ptr) { + return ptr ? OptSlice{*ptr} : OptSlice{}; + } + + protected: + Slice slice_; +}; + /** * A Slice that can be pinned with some cleanup tasks, which will be run upon * ::Reset() or object destruction, whichever is invoked first. This can be used diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h index f2515d03ffa2..f1ed46a62c50 100644 --- a/include/rocksdb/slice_transform.h +++ b/include/rocksdb/slice_transform.h @@ -8,9 +8,8 @@ // // Class for specifying user-defined functions which perform a // transformation on a slice. It is not required that every slice -// belong to the domain and/or range of a function. Subclasses should -// define InDomain and InRange to determine which slices are in either -// of these sets respectively. +// belong to the domain of a function. Subclasses should +// define InDomain to determine which slices are in this set. #pragma once @@ -70,10 +69,6 @@ class SliceTransform : public Customizable { // virtual bool InDomain(const Slice& key) const = 0; - // DEPRECATED: This is currently not used and remains here for backward - // compatibility. - virtual bool InRange(const Slice& /*dst*/) const { return false; } - // Returns information on maximum prefix length, if there is one. // If Transform(x).size() == n for some keys and otherwise < n, // should return true and set *len = n. Returning false is safe but diff --git a/include/rocksdb/sst_file_writer.h b/include/rocksdb/sst_file_writer.h index d893cb1e2afb..607782715a21 100644 --- a/include/rocksdb/sst_file_writer.h +++ b/include/rocksdb/sst_file_writer.h @@ -15,12 +15,6 @@ #include "rocksdb/types.h" #include "rocksdb/wide_columns.h" -#if defined(__GNUC__) || defined(__clang__) -#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__)) -#elif _WIN32 -#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated) -#endif - namespace ROCKSDB_NAMESPACE { class Comparator; @@ -88,24 +82,19 @@ class SstFileWriter { // hint that this file pages is not needed every time we write 1MB to the // file. To use the rate limiter an io_priority smaller than IO_TOTAL can be // passed. - // The `skip_filters` option is DEPRECATED and could be removed in the - // future. Use `BlockBasedTableOptions::filter_policy` to control filter - // generation. SstFileWriter(const EnvOptions& env_options, const Options& options, ColumnFamilyHandle* column_family = nullptr, bool invalidate_page_cache = true, - Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL, - bool skip_filters = false) + Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL) : SstFileWriter(env_options, options, options.comparator, column_family, - invalidate_page_cache, io_priority, skip_filters) {} + invalidate_page_cache, io_priority) {} // Deprecated API SstFileWriter(const EnvOptions& env_options, const Options& options, const Comparator* user_comparator, ColumnFamilyHandle* column_family = nullptr, bool invalidate_page_cache = true, - Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL, - bool skip_filters = false); + Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL); ~SstFileWriter(); @@ -113,12 +102,6 @@ class SstFileWriter { Status Open(const std::string& file_path, Temperature temp = Temperature::kUnknown); - // Add a Put key with value to currently opened file (deprecated) - // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) - // key according to the comparator. - // REQUIRES: comparator is *not* timestamp-aware. - ROCKSDB_DEPRECATED_FUNC Status Add(const Slice& user_key, const Slice& value); - // Add a Put key with value to currently opened file // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) // key according to the comparator. diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 00b95e8d1fd3..7cecac05f7a1 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -162,6 +162,8 @@ enum Tickers : uint32_t { COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, // If a compaction was canceled in sfm to prevent ENOSPC COMPACTION_CANCELLED, + // Number of compactions aborted via AbortAllCompactions() + COMPACTION_ABORTED, // Number of keys written to the database via the Put and Write call's NUMBER_KEYS_WRITTEN, @@ -301,7 +303,7 @@ enum Tickers : uint32_t { NUMBER_RATE_LIMITER_DRAINS, // BlobDB specific stats - // # of Put/PutTTL/PutUntil to BlobDB. Only applicable to legacy BlobDB. + // # of Put/PutWithTTL to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_NUM_PUT, // # of Write to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_NUM_WRITE, @@ -326,12 +328,12 @@ enum Tickers : uint32_t { // # of bytes (keys + value) read from BlobDB. Only applicable to legacy // BlobDB. BLOB_DB_BYTES_READ, - // # of keys written by BlobDB as non-TTL inlined value. Only applicable to - // legacy BlobDB. - BLOB_DB_WRITE_INLINED, - // # of keys written by BlobDB as TTL inlined value. Only applicable to legacy - // BlobDB. - BLOB_DB_WRITE_INLINED_TTL, + // Deprecated: min_blob_size is no longer configurable. Retained to avoid + // shifting enum values. + BLOB_DB_WRITE_INLINED_DEPRECATED, + // Deprecated: min_blob_size is no longer configurable. Retained to avoid + // shifting enum values. + BLOB_DB_WRITE_INLINED_TTL_DEPRECATED, // # of keys written by BlobDB as non-TTL blob value. Only applicable to // legacy BlobDB. BLOB_DB_WRITE_BLOB, @@ -440,13 +442,20 @@ enum Tickers : uint32_t { REMOTE_COMPACT_READ_BYTES, REMOTE_COMPACT_WRITE_BYTES, + // Bytes of output files successfully resumed during compaction + REMOTE_COMPACT_RESUMED_BYTES, + // Tiered storage related statistics HOT_FILE_READ_BYTES, WARM_FILE_READ_BYTES, + COOL_FILE_READ_BYTES, COLD_FILE_READ_BYTES, + ICE_FILE_READ_BYTES, HOT_FILE_READ_COUNT, WARM_FILE_READ_COUNT, + COOL_FILE_READ_COUNT, COLD_FILE_READ_COUNT, + ICE_FILE_READ_COUNT, // Last level and non-last level read statistics LAST_LEVEL_READ_BYTES, @@ -516,14 +525,16 @@ enum Tickers : uint32_t { // Number of FIFO compactions that drop files based on different reasons FIFO_MAX_SIZE_COMPACTIONS, FIFO_TTL_COMPACTIONS, + FIFO_CHANGE_TEMPERATURE_COMPACTIONS, // Number of bytes prefetched during user initiated scan PREFETCH_BYTES, - // Number of prefetched bytes that were actually useful + // Number of prefetched bytes that were actually useful during user initiated + // scan PREFETCH_BYTES_USEFUL, - // Number of FS reads avoided due to scan prefetching + // Number of FS reads avoided due to prefetching during user initiated scan PREFETCH_HITS, // Footer corruption detected when opening an SST file for reading @@ -534,6 +545,44 @@ enum Tickers : uint32_t { FILE_READ_CORRUPTION_RETRY_COUNT, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT, + // Counter for the number of times a WBWI is ingested into the DB. This + // happens when IngestWriteBatchWithIndex() is used and when large + // transaction optimization is enabled through + // TransactionOptions::large_txn_commit_optimize_threshold. + NUMBER_WBWI_INGEST, + + // Failure to load the UDI during SST table open + SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT, + + // MultiScan statistics + // # of Prepare() calls + MULTISCAN_PREPARE_CALLS, + // # of Prepare() calls that failed + MULTISCAN_PREPARE_ERRORS, + // # of data blocks prefetched from storage during MultiScan + MULTISCAN_BLOCKS_PREFETCHED, + // # of blocks found already in cache during MultiScan Prepare + MULTISCAN_BLOCKS_FROM_CACHE, + // Total bytes prefetched during MultiScan + MULTISCAN_PREFETCH_BYTES, + // # of prefetched blocks that were never accessed + MULTISCAN_PREFETCH_BLOCKS_WASTED, + // # of actual I/O requests issued during MultiScan + MULTISCAN_IO_REQUESTS, + // # of non-adjacent blocks coalesced into single I/O (within + // io_coalesce_threshold) + MULTISCAN_IO_COALESCED_NONADJACENT, + // # of seeks that failed validation (out of order, etc.) + MULTISCAN_SEEK_ERRORS, + + // IODispatcher memory limiting statistics + // # of bytes granted to prefetch requests + PREFETCH_MEMORY_BYTES_GRANTED, + // # of bytes released from prefetch memory + PREFETCH_MEMORY_BYTES_RELEASED, + // # of prefetch requests that were blocked waiting for memory + PREFETCH_MEMORY_REQUESTS_BLOCKED, + TICKER_ENUM_MAX }; @@ -612,8 +661,7 @@ enum Histograms : uint32_t { BLOB_DB_KEY_SIZE, // Size of values written to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_VALUE_SIZE, - // BlobDB Put/PutWithTTL/PutUntil/Write latency. Only applicable to legacy - // BlobDB. + // BlobDB Put/PutWithTTL/Write latency. Only applicable to legacy BlobDB. BLOB_DB_WRITE_MICROS, // BlobDB Get latency. Only applicable to legacy BlobDB. BLOB_DB_GET_MICROS, @@ -657,16 +705,31 @@ enum Histograms : uint32_t { ASYNC_READ_BYTES, POLL_WAIT_MICROS, + // Number of bytes for RocksDB's prefetching (as opposed to file + // system's prefetch) on SST file during compaction read + COMPACTION_PREFETCH_BYTES, + // Number of prefetched bytes discarded by RocksDB. PREFETCHED_BYTES_DISCARDED, // Wait time for aborting async read in FilePrefetchBuffer destructor ASYNC_PREFETCH_ABORT_MICROS, - // Number of bytes read for RocksDB's prefetching contents (as opposed to file + // Number of bytes for RocksDB's prefetching contents (as opposed to file // system's prefetch) from the end of SST table during block based table open TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, + // Number of operations per transaction. + NUM_OP_PER_TRANSACTION, + + // MultiScan Prefill iterator Prepare cost + MULTISCAN_PREPARE_ITERATORS, + + // Total Prepare() latency for MultiScan + MULTISCAN_PREPARE_MICROS, + // Distribution of blocks prefetched per MultiScan Prepare() + MULTISCAN_BLOCKS_PER_PREPARE, + HISTOGRAM_ENUM_MAX }; diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index 82597239fff7..c3eeb082c3ed 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -115,6 +115,9 @@ class Status { kIOFenced = 14, kMergeOperatorFailed = 15, kMergeOperandThresholdExceeded = 16, + kPrefetchLimitReached = 17, + kNotExpectedCodePath = 18, + kCompactionAborted = 19, kMaxSubCode }; @@ -316,12 +319,21 @@ class Status { return Status(kInvalidArgument, kTxnNotPrepared, msg, msg2); } + static Status LockLimit() { return Status(kAborted, kLockLimit); } + + static Status PrefetchLimitReached() { + return Status(kIncomplete, kPrefetchLimitReached); + } + // Returns true iff the status indicates success. bool ok() const { MarkChecked(); return code() == kOk; } + // Assert the status is OK in debug mode + void AssertOK() const { assert(ok()); } + // Returns true iff the status indicates success *with* something // overwritten bool IsOkOverwritten() const { @@ -472,6 +484,13 @@ class Status { return (code() == kIncomplete) && (subcode() == kManualCompactionPaused); } + // Returns true iff the status indicates compaction aborted. This + // is caused by a call to AbortAllCompactions + bool IsCompactionAborted() const { + MarkChecked(); + return (code() == kIncomplete) && (subcode() == kCompactionAborted); + } + // Returns true iff the status indicates a TxnNotPrepared error. bool IsTxnNotPrepared() const { MarkChecked(); @@ -484,6 +503,13 @@ class Status { return (code() == kIOError) && (subcode() == kIOFenced); } + // Returns true iff the status indicates prefetch limit reached during + // MultiScan. + bool IsPrefetchLimitReached() const { + MarkChecked(); + return (code() == kIncomplete) && (subcode() == kPrefetchLimitReached); + } + // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. std::string ToString() const; diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index e1f76fcd4632..3485c41f8079 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -44,6 +44,7 @@ class TableReader; class WritableFileWriter; struct ConfigOptions; struct EnvOptions; +class UserDefinedIndexFactory; // Types of checksums to use for checking integrity of logical blocks within // files. All checksums currently use 32 bits of checking power (1 in 4B @@ -126,7 +127,15 @@ struct CacheUsageOptions { }; // Configures how SST files using the block-based table format (standard) -// are written and read. +// are written and read. With few exceptions, each option only affects either +// (a) how new SST files are written, or (b) how SST files are read. If an +// option seems to affect how the SST file is constructed, e.g. format_version, +// that option *ONLY* has an effect at construction time. Contrast this with +// options like the various `cache` and `pin` options, that only affect +// in-memory and IO behavior at read time. In general, any version of RocksDB +// able to read the full key-value and indexing data in the SST file will read +// it as written regardless of current options for writing new files. See +// filter_policy regarding filters. // // Except as specifically noted, all options here are "mutable" using // SetOptions(), with the caveat that only new table builders and new table @@ -254,6 +263,21 @@ struct BlockBasedTableOptions { IndexType index_type = kBinarySearch; + // The search algorithm used when seeking to entries in the index block. + enum BlockSearchType : char { + // Standard binary search + kBinary = 0x00, + // Interpolation search, which may be better suited for uniformly + // distributed keys. This will only be applicable if the comparator is the + // byte-wise comparator. Avoid using + // IndexShorteningMode::kShortenSeparatorsAndSuccessor as shortening the + // succesor can skew the end key and make interpolation search significantly + // less performant. + kInterpolation = 0x01, + }; + + BlockSearchType index_block_search_type = kBinary; + // The index type that will be used for the data block. enum DataBlockIndexType : char { kDataBlockBinarySearch = 0, // traditional block type @@ -431,10 +455,13 @@ struct BlockBasedTableOptions { // versions of RocksDB able to read partitioned filters are able to read // decoupled partitioned filters.) // - // decouple_partitioned_filters = false is the original behavior, because of - // limitations in the initial implementation, and the new behavior - // decouple_partitioned_filters = true is expected to become the new default. - bool decouple_partitioned_filters = false; + // decouple_partitioned_filters = true is the new default. This option is now + // DEPRECATED and might be ignored and/or removed in a future release. + // + // NOTE: decouple_partitioned_filters = false with partition_filters = true + // disables parallel compression (CompressionOptions::parallel_threads + // sanitized to 1). + bool decouple_partitioned_filters = true; // Option to generate Bloom/Ribbon filters that minimize memory // internal fragmentation. @@ -480,8 +507,29 @@ struct BlockBasedTableOptions { // If non-nullptr, use the specified filter policy to reduce disk reads. // Many applications will benefit from passing the result of // NewBloomFilterPolicy() here. + // + // Because filters only impact performance and are not data-critical, an + // SST file can be opened and used without filters if (a) the filter + // policy name or schema is unrecognized, or (b) filter_policy is nullptr. + // See filter_policy regarding filters. std::shared_ptr filter_policy = nullptr; + // EXPERIMENTAL + // + // If non-nullptr, use the specified factory to build user-defined index. + // This allows users to define their own index format and build the index + // during table building. + // + // NOTE: UserDefinedIndexFactory currently disables parallel compression + // (CompressionOptions::parallel_threads sanitized to 1). + std::shared_ptr user_defined_index_factory = nullptr; + + // EXPERIMENTAL + // + // Return an error Status if a user_defined_index_factory is configured, + // but there's no corresponding UDI block in the SST file being opened. + bool fail_if_no_udi_on_open = false; + // If true, place whole keys in the filter (not just prefixes). // This must generally be true for gets to be efficient. bool whole_key_filtering = true; @@ -524,13 +572,10 @@ struct BlockBasedTableOptions { // Default: 0 (disabled) uint32_t read_amp_bytes_per_bit = 0; - // We currently have these versions: - // 0 -- This version can be read by really old RocksDB's. Doesn't support - // changing checksum type (default is CRC32). - // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default - // checksum, like xxHash. It is written by RocksDB when - // BlockBasedTableOptions::checksum is something other than kCRC32c. (version - // 0 is silently upconverted) + // We currently have these format versions: + // 0 - 1 -- No longer supported. Attempting to read files with these format + // versions will return an error. To upgrade, load the data with RocksDB + // >= 4.6.0 and < 11.0.0, then run a full compaction. // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you // don't plan to run RocksDB before version 3.10, you should probably use @@ -553,6 +598,10 @@ struct BlockBasedTableOptions { // misplaced within or between files is as likely to fail checksum // verification as random corruption. Also checksum-protects SST footer. // Can be read by RocksDB versions >= 8.6.0. + // 7 -- Support for custom compression algorithms with a CompressionManager + // using a non-built-in CompatibilityName(). See `compression_manager` in + // ColumnFamilyOptions. Also changes the format of TableProperties field + // `compression_name`. Can be read by RocksDB versions >= 10.4.0. // // Using the default setting of format_version is strongly recommended, so // that available enhancements are adopted eventually and automatically. The @@ -560,7 +609,7 @@ struct BlockBasedTableOptions { // validation and sufficient time and number of releases have elapsed // (6 months recommended) to ensure a clean downgrade/revert path for users // who might only upgrade a few times per year. - uint32_t format_version = 6; + uint32_t format_version = 7; // Store index blocks on disk in compressed format. Changing this option to // false will avoid the overhead of decompression if index blocks are evicted @@ -570,6 +619,30 @@ struct BlockBasedTableOptions { // Align data blocks on lesser of page size and block size bool block_align = false; + // Align data blocks on super block alignment. Avoid a data block split across + // super block boundaries. Works with/without compression. + // + // Here a "super block" refers to an aligned unit of underlying Filesystem + // storage for which there is an extra cost when a random read involves two + // such super blocks instead of just one. Configuring that size here suggests + // inserting padding in the SST file to avoid a single SST block splitting + // across two super blocks. Only power-of-two sizes are supported. See also + // super_block_alignment_space_overhead_ratio. Default to 0, which means super + // block alignment is disabled. + // + // Super block alignment size. Default to 0, which means super block alignment + // is disabled. If it is enabled, it needs to be a power of 2 and higher than + // block size. + size_t super_block_alignment_size = 0; + + // This option constrols the storage space overhead of super block alignment. + // It is used to calculate the max padding size allowed for super block + // alignment. It is calculated in this way. If super_block_alignment_size is + // 2MB, and super_block_alignment_overhead_ratio is 128, then the max padding + // size allowed for super block alignment is 2MB / 128 = 16KB. + // Note that, when it is set to 0, super block alignment is disabled. + size_t super_block_alignment_space_overhead_ratio = 128; + // This enum allows trading off increased index size for improved iterator // seek performance in some situations, particularly when block cache is // disabled (ReadOptions::fill_cache = false) and direct IO is diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index 00e448ba7d7f..6bac922761f9 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -76,6 +76,7 @@ struct TablePropertiesNames { static const std::string kTailStartOffset; static const std::string kUserDefinedTimestampsPersisted; static const std::string kKeyLargestSeqno; + static const std::string kKeySmallestSeqno; }; // `TablePropertiesCollector` provides the mechanism for users to collect @@ -109,6 +110,10 @@ class TablePropertiesCollector { // table. // @params key the user key that is inserted into the table. // @params value the value that is inserted into the table. + // @params file_size the current file size. For BlockBasedTable, this + // includes all the data blocks written so far, upto but not including + // the current block being built. With parallel compression, data + // blocks are written async so it depends on the compression progress. virtual Status AddUserKey(const Slice& key, const Slice& value, EntryType /*type*/, SequenceNumber /*seq*/, uint64_t /*file_size*/) { @@ -143,7 +148,7 @@ class TablePropertiesCollector { // The name of the properties collector can be used for debugging purpose. virtual const char* Name() const = 0; - // EXPERIMENTAL Return whether the output file should be further compacted + // Return whether the output file should be further compacted virtual bool NeedCompact() const { return false; } // For internal use only. @@ -216,6 +221,8 @@ struct TableProperties { uint64_t orig_file_number = 0; // the total size of all data blocks. uint64_t data_size = 0; + // the total uncompressed size of all data blocks (since RocksDB 10.7) + uint64_t uncompressed_data_size = 0; // the size of index block. uint64_t index_size = 0; // Total number of index partitions if kTwoLevelIndexSearch is used @@ -303,6 +310,16 @@ struct TableProperties { // table is empty). uint64_t key_largest_seqno = UINT64_MAX; + bool HasKeyLargestSeqno() const { return key_largest_seqno != UINT64_MAX; } + + // The smallest sequence number of keys in this file. + // UINT64_MAX means unknown. + // Only written to properties block if known (should be known unless the + // table is empty). + uint64_t key_smallest_seqno = UINT64_MAX; + + bool HasKeySmallestSeqno() const { return key_smallest_seqno != UINT64_MAX; } + // DB identity // db_id is an identifier generated the first time the DB is created // If DB identity is unset or unassigned, `db_id` will be an empty string. @@ -344,7 +361,20 @@ struct TableProperties { // {collector_name[1]},{collector_name[2]},{collector_name[3]} .. std::string property_collectors_names; - // The compression algo used to compress the SST files. + // Identifies the compression algorithm or schema used in the file. + // Specifically: + // * For format_version < 7, it is one of several names for built-in + // compression types. Because of how some previous versions of RocksDB + // behave, this must be set to "ZSTD" if any blocks are compressed + // with zstd and must NOT be set to "NoCompression" if any blocks are + // compressed. + // * For format_version >= 7, the format is + // ;; + // where is the CompatibilityName() of the + // CompressionManager used for the file, or empty if compression was + // disabled; represents a sorted set of + // CompressionType values used in the file other than kNoCompression, each + // as 2-digit hex, e.g. 04 for LZ$, 07 for ZSTD, etc. std::string compression_name; // Compression options used to compress the SST files. diff --git a/include/rocksdb/thread_status.h b/include/rocksdb/thread_status.h index 880b0bd4fa20..07c872c0e9b5 100644 --- a/include/rocksdb/thread_status.h +++ b/include/rocksdb/thread_status.h @@ -22,24 +22,16 @@ #include "rocksdb/rocksdb_namespace.h" -#if !defined(NROCKSDB_THREAD_STATUS) -#define ROCKSDB_USING_THREAD_STATUS -#endif - namespace ROCKSDB_NAMESPACE { -// TODO(yhchiang): remove this function once c++14 is available -// as std::max will be able to cover this. -// Current MS compiler does not support constexpr -template -struct constexpr_max { - static const int result = (A > B) ? A : B; -}; - // A structure that describes the current status of a thread. // The status of active threads can be fetched using // ROCKSDB_NAMESPACE::GetThreadList(). struct ThreadStatus { + // Whether RocksDB was built with !NROCKSDB_THREAD_STATUS for + // ROCKSDB_NAMESPACE::GetThreadList() to be supported. + static const bool kEnabled; + // The type of a thread. enum ThreadType : int { HIGH_PRIORITY = 0, // RocksDB BG thread in high-pri thread pool @@ -64,7 +56,7 @@ struct ThreadStatus { OP_VERIFY_FILE_CHECKSUMS, OP_GETENTITY, OP_MULTIGETENTITY, - OP_READ_MANIFEST, + OP_GET_FILE_CHECKSUMS_FROM_CURRENT_MANIFEST, NUM_OP_TYPES }; @@ -102,8 +94,8 @@ struct ThreadStatus { // The maximum number of properties of an operation. // This number should be set to the biggest NUM_XXX_PROPERTIES. - static const int kNumOperationProperties = - constexpr_max::result; + static constexpr int kNumOperationProperties = + std::max(int{NUM_COMPACTION_PROPERTIES}, int{NUM_FLUSH_PROPERTIES}); // The type used to refer to a thread state. // A state describes lower-level action of a thread diff --git a/include/rocksdb/tool_hooks.h b/include/rocksdb/tool_hooks.h index b31780c032f8..a92abde67356 100644 --- a/include/rocksdb/tool_hooks.h +++ b/include/rocksdb/tool_hooks.h @@ -30,18 +30,21 @@ class ToolHooks { ToolHooks() = default; virtual ~ToolHooks() = default; virtual Status Open(const Options& db_options, const std::string& name, - DB** dbptr) = 0; + std::unique_ptr* dbptr) = 0; virtual Status Open( const DBOptions& db_options, const std::string& name, const std::vector& column_families, - std::vector* handles, DB** dbptr) = 0; + std::vector* handles, + std::unique_ptr* dbptr) = 0; virtual Status OpenForReadOnly(const Options& options, - const std::string& name, DB** dbptr, + const std::string& name, + std::unique_ptr* dbptr, bool error_if_wal_file_exists) = 0; virtual Status OpenForReadOnly( const Options& options, const std::string& name, const std::vector& column_families, - std::vector* handles, DB** dbptr) = 0; + std::vector* handles, + std::unique_ptr* dbptr) = 0; virtual Status OpenTransactionDB(const Options& db_options, const TransactionDBOptions& txn_db_options, const std::string& dbname, @@ -62,13 +65,14 @@ class ToolHooks { virtual Status OpenAsSecondary(const Options& options, const std::string& name, const std::string& secondary_path, - DB** dbptr) = 0; + std::unique_ptr* dbptr) = 0; virtual Status OpenAsFollower(const Options& options, const std::string& name, const std::string& leader_path, std::unique_ptr* dbptr) = 0; virtual Status Open(const Options& options, const blob_db::BlobDBOptions& bdb_options, const std::string& dbname, blob_db::BlobDB** blob_db) = 0; + virtual void Exit(int status) = 0; }; class DefaultHooks : public ToolHooks { @@ -76,18 +80,21 @@ class DefaultHooks : public ToolHooks { DefaultHooks() = default; ~DefaultHooks() override = default; virtual Status Open(const Options& db_options, const std::string& name, - DB** dbptr) override; + std::unique_ptr* dbptr) override; virtual Status Open( const DBOptions& db_options, const std::string& name, const std::vector& column_families, - std::vector* handles, DB** dbptr) override; + std::vector* handles, + std::unique_ptr* dbptr) override; virtual Status OpenForReadOnly(const Options& options, - const std::string& name, DB** dbptr, + const std::string& name, + std::unique_ptr* dbptr, bool error_if_wal_file_exists) override; virtual Status OpenForReadOnly( const Options& options, const std::string& name, const std::vector& column_families, - std::vector* handles, DB** dbptr) override; + std::vector* handles, + std::unique_ptr* dbptr) override; virtual Status OpenTransactionDB(const Options& db_options, const TransactionDBOptions& txn_db_options, const std::string& dbname, @@ -109,7 +116,7 @@ class DefaultHooks : public ToolHooks { virtual Status OpenAsSecondary(const Options& options, const std::string& name, const std::string& secondary_path, - DB** dbptr) override; + std::unique_ptr* dbptr) override; virtual Status OpenAsFollower(const Options& options, const std::string& name, const std::string& leader_path, std::unique_ptr* dbptr) override; @@ -117,6 +124,8 @@ class DefaultHooks : public ToolHooks { const blob_db::BlobDBOptions& bdb_options, const std::string& dbname, blob_db::BlobDB** blob_db) override; + + virtual void Exit(int status) override { exit(status); } }; extern DefaultHooks defaultHooks; diff --git a/include/rocksdb/trace_record.h b/include/rocksdb/trace_record.h index 8f9c3ee2f0f5..d321f538745d 100644 --- a/include/rocksdb/trace_record.h +++ b/include/rocksdb/trace_record.h @@ -5,6 +5,7 @@ #pragma once +#include #include #include #include diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h index 368736cbd097..982f497fdf55 100644 --- a/include/rocksdb/types.h +++ b/include/rocksdb/types.h @@ -53,7 +53,8 @@ enum FileType { kMetaDatabase, kIdentityFile, kOptionsFile, - kBlobFile + kBlobFile, + kCompactionProgressFile }; // User-oriented representation of internal key types. @@ -118,7 +119,11 @@ enum class Temperature : uint8_t { kUnknown = 0, kHot = 0x04, kWarm = 0x08, + kCool = 0x0A, kCold = 0x0C, + kIce = 0x10, + // XXX: this is mis-named. It is instead an invalid temperature beyond the + // rest kLastTemperature, }; diff --git a/include/rocksdb/unique_id.h b/include/rocksdb/unique_id.h index eb0c778266cb..3c0c0eb5b1bf 100644 --- a/include/rocksdb/unique_id.h +++ b/include/rocksdb/unique_id.h @@ -33,8 +33,8 @@ namespace ROCKSDB_NAMESPACE { // And assuming one generates many SST files in the lifetime of each process, // the probability of ID collisions is much "better than random"; see // https://github.com/pdillinger/unique_id -Status GetUniqueIdFromTableProperties(const TableProperties &props, - std::string *out_id); +Status GetUniqueIdFromTableProperties(const TableProperties& props, + std::string* out_id); // Computes a 192-bit (24 binary char) stable, universally unique ID // with an extra 64 bits of uniqueness compared to the standard ID. It is only @@ -44,12 +44,12 @@ Status GetUniqueIdFromTableProperties(const TableProperties &props, // example above would expect a global file ID collision every 4 days with // 128-bit IDs (using some worst-case assumptions about process lifetime). // It's 10^17 years with 192-bit IDs. -Status GetExtendedUniqueIdFromTableProperties(const TableProperties &props, - std::string *out_id); +Status GetExtendedUniqueIdFromTableProperties(const TableProperties& props, + std::string* out_id); // Converts a binary string (unique id) to hexadecimal, with each 64 bits // separated by '-', e.g. 6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B // Also works on unique id prefix. -std::string UniqueIdToHumanString(const std::string &id); +std::string UniqueIdToHumanString(const std::string& id); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/universal_compaction.h b/include/rocksdb/universal_compaction.h index e40033cae44a..9a52ee539db2 100644 --- a/include/rocksdb/universal_compaction.h +++ b/include/rocksdb/universal_compaction.h @@ -111,6 +111,24 @@ class CompactionOptionsUniversal { // Default: false bool incremental; + // If true, auto universal compaction picking will adjust to minimize locking + // of input files when bottom priority compactions are waiting to run. This + // can increase the likelihood of existing L0s being selected for compaction, + // thereby improving write stall and reducing read regression. It may increase + // the overrall write amplification and compaction load on low priority + // threads. + // + // Default: true (enabled) + // + // This options does not apply to manual compactions. + // + // This option is temporary in case turning on this feature causes problems + // and users need to undo it quickly. This option is planned for removal in + // the near future with default value set to true. + // + // Dynamically changeable through the SetOptions() API. + bool reduce_file_locking; + // Default set of parameters CompactionOptionsUniversal() : size_ratio(1), @@ -121,11 +139,10 @@ class CompactionOptionsUniversal { max_read_amp(-1), stop_style(kCompactionStopStyleTotalSize), allow_trivial_move(false), - incremental(false) {} + incremental(false), + reduce_file_locking(true) {} -#if __cplusplus >= 202002L bool operator==(const CompactionOptionsUniversal& rhs) const = default; -#endif }; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/user_defined_index.h b/include/rocksdb/user_defined_index.h new file mode 100644 index 000000000000..395f9fbf3530 --- /dev/null +++ b/include/rocksdb/user_defined_index.h @@ -0,0 +1,187 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// ***************************************************************** +// EXPERIMENTAL - subject to change while under development +// ***************************************************************** + +#pragma once + +#include + +#include "rocksdb/advanced_iterator.h" +#include "rocksdb/customizable.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// Prefix for user-defined index block names +inline const std::string kUserDefinedIndexPrefix = + "rocksdb.user_defined_index."; + +// This is a public API for user-defined index builders. +// It allows users to define their own index format and build custom +// indexes during table building. Currently, only a monolithic index +// block is supported (no partitioned index). +// +// This is currently supported only for a restricted set of use cases. The +// CF must be ingest only, and only files containing Puts generated by +// SstFileWriter are supported. + +// The interface for building user-defined index. +class UserDefinedIndexBuilder { + public: + // Right now, we only support Puts. In the future, we may support merges, + // deletions etc. + enum ValueType { + kValue, + kTypeMax, + }; + + // File offset and size of the data block + struct BlockHandle { + uint64_t offset; + uint64_t size; + }; + + virtual ~UserDefinedIndexBuilder() = default; + + // Add a new index entry to index block. The key for the new index entry + // should be >= last_key_in_current_block and < first_key_in_next_block. + // The previous index entry key and the new index entry key cover + // all the keys in the data block associated with the new index entry. + // + // The last_key_in_current_block and first_key_in_next_block will be user + // keys, i.e the user key string, and optionally the user timestamp if one + // is configured, without a sequence number suffix. + // + // Called before the OnKeyAdded() call for first_key_in_next_block. + // @last_key_in_current_block: The last key in the current data block + // @first_key_in_next_block: it will be nullptr if the entry being added is + // the last one in the table + // @block_handle: offset/size of the data block referenced by this index + // entry. This should be stored along with the index entry + // key + // @separator_scratch: a scratch buffer to back a computed separator between + // those, as needed. May be modified on each call. + // @return: the key or separator stored in the index, which could be + // last_key_in_current_block or a computed separator backed by + // separator_scratch. + virtual Slice AddIndexEntry(const Slice& last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle, + std::string* separator_scratch) = 0; + + // This method will be called whenever a key is added. The subclasses may + // override OnKeyAdded() if they need to collect additional information. + // The type argument indicates whether the value is a full value or partial. + // At the moment, only full values are supported. + // + // The key will be a user key. RocksDB guarantees that there will only be + // one entry for each key in the file/index. + virtual void OnKeyAdded(const Slice& /*key*/, ValueType /*type*/, + const Slice& /*value*/) {} + + // Finish building the index. + // Returns a Status and the serialized index contents. + // The memory backing the contents should not be freed until this builder + // object is destructed. + virtual Status Finish(Slice* index_contents) = 0; +}; + +// The interface for iterating the user defined index. This will be +// instantiated and used by a scan to iterate through the index entries +// covered by the scan. +class UserDefinedIndexIterator { + public: + virtual ~UserDefinedIndexIterator() = default; + + // Prepare the iterator for a series of scans. The iterator should use + // this as an opportunity to do any prefetching and buffering of results. + virtual void Prepare(const ScanOptions scan_opts[], size_t num_opts) = 0; + + // Given the target key, position the index iterator at the index entry + // with the smallest key >= target. The result must be updated with the + // index key, and the bound_check_result. The bound_check_result should + // be set to kOutOfBound if no block satisfies the target key and + // termination criteria, kInbound if the data block is definitely fully + // within bounds, or kUnknown if the data block could be partially + // within bounds. + // The UDI implementation needs to be careful about returning kOutOfBound. + // If a limit key is specified in ScanOptions, an implementation that + // does not store the first key in the block for the corresponding index + // entry cannot reliably determine if the block is out of bounds. It must + // compare against the previous index key to determine if the current block + // is out of bounds w.r.t the limit. Other termination criteria (specified + // in property_bag) may cause the scan to terminate earlier, in which case + // kOutOfBound can be returned earlier. + virtual Status SeekAndGetResult(const Slice& target, + IterateResult* result) = 0; + + // Advance to the next index entry. The result must be populated similar + // to SeekAndGetResult. + virtual Status NextAndGetResult(IterateResult* result) = 0; + + // Return the BlockHandle in the current index entry + virtual UserDefinedIndexBuilder::BlockHandle value() = 0; +}; + +// A reader interface for the user defined index +class UserDefinedIndexReader { + public: + virtual ~UserDefinedIndexReader() = default; + + // Allocate an iterator that will be used by RocksDB to perform scans + virtual std::unique_ptr NewIterator( + const ReadOptions& read_options) = 0; + + // The memory usage of the index, including the size of the raw contents and + // any other heap data structures allocated by the reader + virtual size_t ApproximateMemoryUsage() const = 0; +}; + +// Options for user defined index +struct UserDefinedIndexOption { + const Comparator* comparator = BytewiseComparator(); +}; + +// Factory for creating user-defined index builders. +class UserDefinedIndexFactory : public Customizable { + public: + virtual ~UserDefinedIndexFactory() = default; + + static const char* Type() { return "UserDefinedIndexFactory"; } + + static Status CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* factory); + + // Create a new builder for user-defined index. + virtual UserDefinedIndexBuilder* NewBuilder() const = 0; + + // Create a new user defined index reader given the contents of the index + // block + virtual std::unique_ptr NewReader( + Slice& index_block) const = 0; + + // New API for allowing customized comparator + virtual Status NewBuilder( + const UserDefinedIndexOption& /*option*/, + std::unique_ptr& builder) const { + builder.reset(NewBuilder()); + return Status::OK(); + }; + + virtual Status NewReader( + const UserDefinedIndexOption& /*option*/, Slice& index_block, + std::unique_ptr& reader) const { + reader = NewReader(index_block); + return Status::OK(); + }; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/utilities/backup_engine.h b/include/rocksdb/utilities/backup_engine.h index 045fdb06aa49..1961691be15e 100644 --- a/include/rocksdb/utilities/backup_engine.h +++ b/include/rocksdb/utilities/backup_engine.h @@ -621,7 +621,14 @@ class BackupEngineAppendOnlyBase { // The backup will stop ASAP and the call to CreateNewBackup will // return Status::Incomplete(). It will not clean up after itself, but // the state will remain consistent. The state will be cleaned up the - // next time you call CreateNewBackup or GarbageCollect. + // next time you call CreateNewBackup or GarbageCollect for the same backup + // directory on a new BackupEngine object. + // + // NOTE: This is a one-way operation. Once StopBackup() is called on a + // BackupEngine instance, all subsequent backup requests (CreateNewBackup, + // CreateNewBackupWithMetadata) will fail with Status::Incomplete(). + // To create new backups after calling StopBackup(), you must open a new + // BackupEngine instance. virtual void StopBackup() = 0; // Will delete any files left over from incomplete creation or deletion of diff --git a/include/rocksdb/utilities/cache_dump_load.h b/include/rocksdb/utilities/cache_dump_load.h index 8f41839cd9de..ca2ce5ae11aa 100644 --- a/include/rocksdb/utilities/cache_dump_load.h +++ b/include/rocksdb/utilities/cache_dump_load.h @@ -90,7 +90,7 @@ class CacheDumper { public: virtual ~CacheDumper() = default; // Only dump the blocks in the block cache that belong to the DBs in this list - virtual Status SetDumpFilter(std::vector db_list) { + virtual Status SetDumpFilter(const std::vector& db_list) { (void)db_list; return Status::NotSupported("SetDumpFilter is not supported"); } diff --git a/include/rocksdb/utilities/db_ttl.h b/include/rocksdb/utilities/db_ttl.h index 12f5cbac0f75..bccce8ddb14f 100644 --- a/include/rocksdb/utilities/db_ttl.h +++ b/include/rocksdb/utilities/db_ttl.h @@ -63,8 +63,10 @@ class DBWithTTL : public StackableDB { virtual void SetTtl(ColumnFamilyHandle* h, int32_t ttl) = 0; + virtual Status GetTtl(ColumnFamilyHandle* h, int32_t* ttl) = 0; + protected: - explicit DBWithTTL(DB* db) : StackableDB(db) {} + explicit DBWithTTL(std::unique_ptr&& db) : StackableDB(std::move(db)) {} }; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/utilities/debug.h b/include/rocksdb/utilities/debug.h index 1cbc7daf84cc..57968ad15e10 100644 --- a/include/rocksdb/utilities/debug.h +++ b/include/rocksdb/utilities/debug.h @@ -33,12 +33,12 @@ struct KeyVersion { // copied to memory, if the range covers too many keys, the memory usage // may be huge. `max_num_ikeys` can be used to cap the memory usage. // The result is inserted into the provided vector, `key_versions`. -Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, +Status GetAllKeyVersions(DB* db, OptSlice begin_key, OptSlice end_key, size_t max_num_ikeys, std::vector* key_versions); -Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key, - Slice end_key, size_t max_num_ikeys, +Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, OptSlice begin_key, + OptSlice end_key, size_t max_num_ikeys, std::vector* key_versions); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/utilities/env_mirror.h b/include/rocksdb/utilities/env_mirror.h index 40c04095bde9..68cce77dad4e 100644 --- a/include/rocksdb/utilities/env_mirror.h +++ b/include/rocksdb/utilities/env_mirror.h @@ -68,7 +68,7 @@ class EnvMirror : public EnvWrapper { assert(as == bs); return as; } -#if defined(_MSC_VER) +#if defined(_MSC_VER) // ODR-SAFE #pragma warning(push) // logical operation on address of string constant #pragma warning(disable : 4130) @@ -87,7 +87,7 @@ class EnvMirror : public EnvWrapper { *r = ar; return as; } -#if defined(_MSC_VER) +#if defined(_MSC_VER) // ODR-SAFE #pragma warning(pop) #endif Status DeleteFile(const std::string& f) override { diff --git a/include/rocksdb/utilities/ldb_cmd.h b/include/rocksdb/utilities/ldb_cmd.h index e0a1f06a7c8a..aacf9d3e9338 100644 --- a/include/rocksdb/utilities/ldb_cmd.h +++ b/include/rocksdb/utilities/ldb_cmd.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,7 @@ #include "rocksdb/slice.h" #include "rocksdb/utilities/db_ttl.h" #include "rocksdb/utilities/ldb_cmd_execute_result.h" +#include "rocksdb/utilities/transaction_db.h" namespace ROCKSDB_NAMESPACE { @@ -42,6 +44,8 @@ class LDBCommand { static const std::string ARG_TTL; static const std::string ARG_TTL_START; static const std::string ARG_TTL_END; + static const std::string ARG_USE_TXN; + static const std::string ARG_TXN_WRITE_POLICY; static const std::string ARG_TIMESTAMP; static const std::string ARG_TRY_LOAD_OPTIONS; static const std::string ARG_IGNORE_UNKNOWN_OPTIONS; @@ -71,7 +75,6 @@ class LDBCommand { static const std::string ARG_BLOB_FILE_STARTING_LEVEL; static const std::string ARG_PREPOPULATE_BLOB_CACHE; static const std::string ARG_DECODE_BLOB_INDEX; - static const std::string ARG_DUMP_UNCOMPRESSED_BLOBS; static const std::string ARG_READ_TIMESTAMP; static const std::string ARG_GET_WRITE_UNIX_TIME; @@ -163,8 +166,9 @@ class LDBCommand { std::string secondary_path_; std::string leader_path_; std::string column_family_name_; - DB* db_; + std::unique_ptr db_; DBWithTTL* db_ttl_; + TransactionDB* db_txn_; std::map cf_handles_; std::map ucmps_; @@ -183,6 +187,13 @@ class LDBCommand { /** If true, the value is treated as timestamp suffixed */ bool is_db_ttl_; + /** If true, open the DB as TransactionDB */ + bool is_db_txn_; + + /** Transaction write policy (0=WRITE_COMMITTED, 1=WRITE_PREPARED, + * 2=WRITE_UNPREPARED) */ + int txn_write_policy_; + // If true, the kvs are output with their insert/modify timestamp in a ttl db bool timestamp_; diff --git a/include/rocksdb/utilities/ldb_cmd_execute_result.h b/include/rocksdb/utilities/ldb_cmd_execute_result.h index 57bac334682b..2af07eeba55f 100644 --- a/include/rocksdb/utilities/ldb_cmd_execute_result.h +++ b/include/rocksdb/utilities/ldb_cmd_execute_result.h @@ -9,10 +9,6 @@ #include "rocksdb/rocksdb_namespace.h" -#ifdef FAILED -#undef FAILED -#endif - namespace ROCKSDB_NAMESPACE { class LDBCommandExecuteResult { diff --git a/include/rocksdb/utilities/lua/rocks_lua_custom_library.h b/include/rocksdb/utilities/lua/rocks_lua_custom_library.h deleted file mode 100644 index f617da02bea6..000000000000 --- a/include/rocksdb/utilities/lua/rocks_lua_custom_library.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2016, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once -#ifdef LUA - -// lua headers -extern "C" { -#include -#include -#include -} - -namespace ROCKSDB_NAMESPACE { -namespace lua { -// A class that used to define custom C Library that is callable -// from Lua script -class RocksLuaCustomLibrary { - public: - virtual ~RocksLuaCustomLibrary() {} - // The name of the C library. This name will also be used as the table - // (namespace) in Lua that contains the C library. - virtual const char* Name() const = 0; - - // Returns a "static const struct luaL_Reg[]", which includes a list of - // C functions. Note that the last entry of this static array must be - // {nullptr, nullptr} as required by Lua. - // - // More details about how to implement Lua C libraries can be found - // in the official Lua document http://www.lua.org/pil/26.2.html - virtual const struct luaL_Reg* Lib() const = 0; - - // A function that will be called right after the library has been created - // and pushed on the top of the lua_State. This custom setup function - // allows developers to put additional table or constant values inside - // the same table / namespace. - virtual void CustomSetup(lua_State* /*L*/) const {} -}; -} // namespace lua -} // namespace ROCKSDB_NAMESPACE -#endif // LUA diff --git a/include/rocksdb/utilities/lua/rocks_lua_util.h b/include/rocksdb/utilities/lua/rocks_lua_util.h deleted file mode 100644 index 3427b65ef674..000000000000 --- a/include/rocksdb/utilities/lua/rocks_lua_util.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2016, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once -// lua headers -extern "C" { -#include -#include -#include -} - -#ifdef LUA -#include -#include - -#include "rocksdb/utilities/lua/rocks_lua_custom_library.h" - -namespace ROCKSDB_NAMESPACE { -namespace lua { -class LuaStateWrapper { - public: - explicit LuaStateWrapper(const std::string& lua_script) { - lua_state_ = luaL_newstate(); - Init(lua_script, {}); - } - LuaStateWrapper( - const std::string& lua_script, - const std::vector>& libraries) { - lua_state_ = luaL_newstate(); - Init(lua_script, libraries); - } - lua_State* GetLuaState() const { return lua_state_; } - ~LuaStateWrapper() { lua_close(lua_state_); } - - private: - void Init( - const std::string& lua_script, - const std::vector>& libraries) { - if (lua_state_) { - luaL_openlibs(lua_state_); - for (const auto& library : libraries) { - luaL_openlib(lua_state_, library->Name(), library->Lib(), 0); - library->CustomSetup(lua_state_); - } - luaL_dostring(lua_state_, lua_script.c_str()); - } - } - - lua_State* lua_state_; -}; -} // namespace lua -} // namespace ROCKSDB_NAMESPACE -#endif // LUA diff --git a/include/rocksdb/utilities/memory_util.h b/include/rocksdb/utilities/memory_util.h index acebc8b4a655..40d9f5646c46 100644 --- a/include/rocksdb/utilities/memory_util.h +++ b/include/rocksdb/utilities/memory_util.h @@ -6,6 +6,7 @@ #pragma once #include +#include #include #include #include @@ -39,8 +40,11 @@ class MemoryUtil { // only report the usage of the input "cache_set" without // including those Cache usage inside the input list "dbs" // of DBs. + // + // Supports vectors of DB* or unique_ptr. + template static Status GetApproximateMemoryUsageByType( - const std::vector& dbs, + const std::vector& dbs, const std::unordered_set cache_set, std::map* usage_by_type); }; diff --git a/include/rocksdb/utilities/optimistic_transaction_db.h b/include/rocksdb/utilities/optimistic_transaction_db.h index 875a132e408f..eb9f973a82b1 100644 --- a/include/rocksdb/utilities/optimistic_transaction_db.h +++ b/include/rocksdb/utilities/optimistic_transaction_db.h @@ -123,7 +123,8 @@ class OptimisticTransactionDB : public StackableDB { protected: // To Create an OptimisticTransactionDB, call Open() - explicit OptimisticTransactionDB(DB* db) : StackableDB(db) {} + explicit OptimisticTransactionDB(std::unique_ptr&& db) + : StackableDB(std::move(db)) {} }; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/utilities/option_change_migration.h b/include/rocksdb/utilities/option_change_migration.h index 0ad00cc860e3..5c13329dc130 100644 --- a/include/rocksdb/utilities/option_change_migration.h +++ b/include/rocksdb/utilities/option_change_migration.h @@ -6,19 +6,47 @@ #pragma once #include +#include +#include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { -// Try to migrate DB created with old_opts to be use new_opts. -// Multiple column families is not supported. -// It is best-effort. No guarantee to succeed. -// A full compaction may be executed. +// Prepares a database to be compatible with new_opts after using old_opts. +// Restructures the LSM tree but does NOT apply new_opts - you must call +// DB::Open(new_opts, dbname) afterward to actually use the new configuration. +// It is best-effort with no guarantee to succeed. A full compaction may be +// executed. +// +// Limitations: single column family only +// // WARNING: using this to migrate from non-FIFO to FIFO compaction // with `Options::compaction_options_fifo.max_table_files_size` > 0 can cause // the whole DB to be dropped right after migration if the migrated data is // larger than `max_table_files_size` -Status OptionChangeMigration(std::string dbname, const Options& old_opts, +Status OptionChangeMigration(const std::string& dbname, const Options& old_opts, const Options& new_opts); + +// Multi-CF version: Prepares a database with multiple column families to be +// compatible with new options after using old options. +// +// REQUIREMENTS: +// - old_cf_descs and new_cf_descs MUST have the same number of CFs +// - old_cf_descs and new_cf_descs MUST have the same CF names IN THE SAME ORDER +// - Adding or dropping CFs is NOT supported - use CreateColumnFamily() or +// DropColumnFamily() separately before/after migration +// +// The function will return InvalidArgument status if these requirements are +// violated. +// +// WARNING: using this to migrate from non-FIFO to FIFO compaction +// with `max_table_files_size` > 0 can cause the whole DB to be dropped right +// after migration if the migrated data is larger than `max_table_files_size` +Status OptionChangeMigration( + const std::string& dbname, const DBOptions& old_db_opts, + const std::vector& old_cf_descs, + const DBOptions& new_db_opts, + const std::vector& new_cf_descs); + } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 244989a6c98e..de43ba386282 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -289,6 +289,13 @@ class StackableDB : public DB { return db_->NewAttributeGroupIterator(options, column_families); } + using DB::NewMultiScan; + std::unique_ptr NewMultiScan( + const ReadOptions& opts, ColumnFamilyHandle* column_family, + const MultiScanArgs& scan_opts) override { + return db_->NewMultiScan(opts, column_family, scan_opts); + } + const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); } void ReleaseSnapshot(const Snapshot* snapshot) override { @@ -368,6 +375,8 @@ class StackableDB : public DB { void DisableManualCompaction() override { return db_->DisableManualCompaction(); } + void AbortAllCompactions() override { return db_->AbortAllCompactions(); } + void ResumeAllCompactions() override { return db_->ResumeAllCompactions(); } Status WaitForCompact( const WaitForCompactOptions& wait_for_compact_options) override { @@ -379,11 +388,6 @@ class StackableDB : public DB { return db_->NumberLevels(column_family); } - using DB::MaxMemCompactionLevel; - int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override { - return db_->MaxMemCompactionLevel(column_family); - } - using DB::Level0StopWriteTrigger; int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) override { return db_->Level0StopWriteTrigger(column_family); @@ -416,7 +420,11 @@ class StackableDB : public DB { Status SyncWAL() override { return db_->SyncWAL(); } + using DB::FlushWAL; Status FlushWAL(bool sync) override { return db_->FlushWAL(sync); } + Status FlushWAL(const FlushWALOptions& options) override { + return db_->FlushWAL(options); + } Status LockWAL() override { return db_->LockWAL(); } @@ -445,6 +453,12 @@ class StackableDB : public DB { db_->GetColumnFamilyMetaData(column_family, cf_meta); } + void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + const GetColumnFamilyMetaDataOptions& options, + ColumnFamilyMetaData* metadata) override { + db_->GetColumnFamilyMetaData(column_family, options, metadata); + } + using DB::StartBlockCacheTrace; Status StartBlockCacheTrace( const TraceOptions& trace_options, @@ -505,13 +519,18 @@ class StackableDB : public DB { return db_->GetFullHistoryTsLow(column_family, ts_low); } + Status GetNewestUserDefinedTimestamp(ColumnFamilyHandle* column_family, + std::string* newest_timestamp) override { + return db_->GetNewestUserDefinedTimestamp(column_family, newest_timestamp); + } + Status GetSortedWalFiles(VectorWalPtr& files) override { return db_->GetSortedWalFiles(files); } Status GetCurrentWalFile( - std::unique_ptr* current_log_file) override { - return db_->GetCurrentWalFile(current_log_file); + std::unique_ptr* current_wal_file) override { + return db_->GetCurrentWalFile(current_wal_file); } Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override { @@ -527,10 +546,11 @@ class StackableDB : public DB { } using DB::SetOptions; - Status SetOptions(ColumnFamilyHandle* column_family_handle, - const std::unordered_map& - new_options) override { - return db_->SetOptions(column_family_handle, new_options); + Status SetOptions( + const std::unordered_map>& + column_families_opts_map) override { + return db_->SetOptions(column_families_opts_map); } Status SetDBOptions(const std::unordered_map& @@ -554,6 +574,14 @@ class StackableDB : public DB { return db_->GetPropertiesOfTablesInRange(column_family, range, n, props); } + using DB::GetPropertiesOfTablesByLevel; + Status GetPropertiesOfTablesByLevel( + ColumnFamilyHandle* column_family, + std::vector>* props_by_level) + override { + return db_->GetPropertiesOfTablesByLevel(column_family, props_by_level); + } + Status GetUpdatesSince( SequenceNumber seq_number, std::unique_ptr* iter, const TransactionLogIterator::ReadOptions& read_options) override { diff --git a/include/rocksdb/utilities/table_properties_collectors.h b/include/rocksdb/utilities/table_properties_collectors.h index 0f79f725e5d8..c8c8af1de6a8 100644 --- a/include/rocksdb/utilities/table_properties_collectors.h +++ b/include/rocksdb/utilities/table_properties_collectors.h @@ -23,15 +23,20 @@ class CompactOnDeletionCollectorFactory // A factory of a table property collector that marks a SST // file as need-compaction when it observe at least "D" deletion // entries in any "N" consecutive entries, or the ratio of tombstone - // entries >= deletion_ratio. + // entries >= deletion_ratio for the entire file. // // @param sliding_window_size "N" // @param deletion_trigger "D" // @param deletion_ratio, if <= 0 or > 1, disable triggering compaction // based on deletion ratio. + // @param min_file_size, a file needs to be at least this size to be marked + // for compaction. See comments above + // TablePropertiesCollector::AddUserKey() for limitations/inaccuracies on + // the file size. CompactOnDeletionCollectorFactory(size_t sliding_window_size, size_t deletion_trigger, - double deletion_ratio); + double deletion_ratio, + uint64_t min_file_size = 0); ~CompactOnDeletionCollectorFactory() override {} @@ -59,6 +64,12 @@ class CompactOnDeletionCollectorFactory } double GetDeletionRatio() const { return deletion_ratio_.load(); } + + uint64_t GetMinFileSize() const { return min_file_size_.load(); } + void SetMinFileSize(uint64_t min_file_size) { + min_file_size_.store(min_file_size); + } + static const char* kClassName() { return "CompactOnDeletionCollector"; } const char* Name() const override { return kClassName(); } @@ -68,6 +79,7 @@ class CompactOnDeletionCollectorFactory std::atomic sliding_window_size_; std::atomic deletion_trigger_; std::atomic deletion_ratio_; + std::atomic min_file_size_; }; // Creates a factory of a table property collector that marks a SST @@ -85,7 +97,8 @@ class CompactOnDeletionCollectorFactory std::shared_ptr NewCompactOnDeletionCollectorFactory(size_t sliding_window_size, size_t deletion_trigger, - double deletion_ratio = 0); + double deletion_ratio = 0, + uint64_t min_file_size = 0); // A factory of a table property collector that marks a SST file as // need-compaction when for the tiering use case, it observes, among all the diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index 6c444ac26df5..51b4eb026211 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -653,7 +653,12 @@ class Transaction { // Change the value of TransactionOptions.lock_timeout (in milliseconds) for // this transaction. // Has no effect on OptimisticTransactions. - virtual void SetLockTimeout(int64_t timeout) = 0; + virtual void SetLockTimeout(int64_t timeout_ms) = 0; + + // Change the value of deadlock_timeout (in milliseconds) for this + // transaction. + // Has no effect on OptimisticTransactions. + virtual void SetDeadlockTimeout(int64_t timeout_ms) = 0; // Return the WriteOptions that will be used during Commit() virtual WriteOptions* GetWriteOptions() = 0; diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 766fe75917c5..e0af0caa0bd1 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -24,9 +24,16 @@ class SecondaryIndex; class TransactionDBMutexFactory; enum TxnDBWritePolicy { - WRITE_COMMITTED = 0, // write only the committed data - WRITE_PREPARED, // write data after the prepare phase of 2pc - WRITE_UNPREPARED // write data before the prepare phase of 2pc + // Write data at transaction commit time + WRITE_COMMITTED = 0, + + // EXPERIMENTAL: The remaining write policies are not as mature, well + // validated, nor as compatible with other features as WRITE_COMMITTED. + + // Write data after the prepare phase of 2pc + WRITE_PREPARED, + // Write data before the prepare phase of 2pc + WRITE_UNPREPARED }; constexpr uint32_t kInitialMaxDeadlocks = 5; @@ -210,6 +217,11 @@ struct TransactionDBOptions { // Other value means the user provides a custom lock manager. std::shared_ptr lock_mgr_handle; + // EXPERIMENTAL + // + // Flag to enable/disable the per key point lock manager. + bool use_per_key_point_lock_mgr = false; + // If true, the TransactionDB implementation might skip concurrency control // unless it is overridden by TransactionOptions or // TransactionDBWriteOptimizations. This can be used in conjunction with @@ -247,10 +259,12 @@ struct TransactionDBOptions { // for more details. std::vector> secondary_indices; - // EXPERIMENTAL, SUBJECT TO CHANGE + // Deprecated, this option has no effect and may be removed in the future. + // Use TransactionOptions::large_txn_commit_optimize_threshold instead. + // // This option is only valid for write committed. If the number of updates in - // a transaction exceeds this threshold, then the transaction commit will skip - // insertions into memtable as an optimization to reduce commit latency. + // a transaction is at least this threshold, then the transaction commit will + // skip insertions into memtable as an optimization to reduce commit latency. // See comment for TransactionOptions::commit_bypass_memtable for more detail. // Setting TransactionOptions::commit_bypass_memtable to true takes precedence // over this option. @@ -310,6 +324,22 @@ struct TransactionOptions { // If negative, TransactionDBOptions::transaction_lock_timeout will be used. int64_t lock_timeout = -1; + // Timeout in microseconds before perform dead lock detection. + // If 0, deadlock detection will be performed immediately. + // + // To optimize performance, this parameter could be tuned. + // + // When deadlock happens very frequently, deadlock timeout should be set to 0, + // so deadlock will be detected immediately. + // + // When deadlock happen very rarely, this timeout could be turned to be + // slightly longer than the typical transaction execution time, so that + // transaction will be waked up to take the lock before this timeout, which + // will allow the transaction to save the CPU time on deadlock detection. + // + // Deadlock timeout is always smaller than lock_timeout. + int64_t deadlock_timeout_us = 500; + // Expiration duration in milliseconds. If non-negative, transactions that // last longer than this many milliseconds will fail to commit. If not set, // a forgotten transaction that is never committed, rolled back, or deleted @@ -357,10 +387,28 @@ struct TransactionOptions { // DeleteRange, SingleDelete. bool write_batch_track_timestamp_size = false; + // The following three options enable optimizations for large transaction + // commit to bypass memtable write. + // - If any transaction's commit should bybass memtable write, + // set commit_bypass_memtable to true. + // - If only bypass memtable write for transactions with >= n operations, + // set commit_bypass_memtable to false, + // large_txn_commit_optimize_threshold to n, and + // large_txn_commit_optimize_byte_threshold to 0. + // Similarly for only optimize when a transaction's write batch size is >= n. + // - If bypass memtable write for transactions with >= n operations or >= x + // bytes, + // set commit_bypass_memtable to false, + // large_txn_commit_optimize_threshold to n, and + // large_txn_commit_optimize_byte_threshold to x. + // + // // EXPERIMENTAL, SUBJECT TO CHANGE // Only supports write-committed policy. If set to true, the transaction will // skip memtable write and ingest into the DB directly during Commit(). This // makes Commit() much faster for transactions with many operations. + // Transaction neeeds to call Prepare() before Commit() for this option to + // take effect. // Transactions with Merge() or PutEntity() is not supported yet. // // Note that the transaction will be ingested as an immutable memtable for @@ -369,15 +417,31 @@ struct TransactionOptions { // due to too many memtables. // Note that the ingestion relies on the transaction's underlying index, // (WriteBatchWithIndex), so updates that are added to the transaction - // without indexing (e.g. added directly to the transaction underlying + // without indexing (i.e. added directly to the transaction underlying // write batch through Transaction::GetWriteBatch()->GetWriteBatch()) - // are not supported. They will not be applied to the DB. + // are not supported, and the optimization will not apply in that case. // // NOTE: since WBWI keep track of the most recent update per key, a Put // followed by a SingleDelete will be written to DB as a SingleDelete. This // can cause flush/compaction to report `num_single_del_mismatch` due to // consecutive SingleDeletes. bool commit_bypass_memtable = false; + + // EXPERIMENTAL, SUBJECT TO CHANGE + // When the number of updates in a transaction is at least this threshold, + // we will enable optimizations for commiting a large transaction. See + // comment for `commit_bypass_memtable` for more optimization detail. + // + // Default: 0 (disabled). + uint32_t large_txn_commit_optimize_threshold = 0; + + // EXPERIMENTAL, SUBJECT TO CHANGE + // When the size of a transaction's write batch is at least this threshold, + // we will enable optimizations for commiting a large transaction. See + // comment for `commit_bypass_memtable` for more optimization detail. + // + // Default: 0 (disabled). + uint64_t large_txn_commit_optimize_byte_threshold = 0; }; // The per-write optimizations that do not involve transactions. TransactionDB diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index 6ff8b587099d..edced15b9ec7 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -90,6 +90,8 @@ class WBWIIterator { // Returns n where the current entry is the n-th update to the current key. // The update count starts from 1. // Only valid if WBWI is created with overwrite_key = true. + // With overwrite_key=false, update count for each entry is not maintained, + // see UpdateExistingEntryWithCfId(). virtual uint32_t GetUpdateCount() const { return 0; } }; @@ -234,7 +236,8 @@ class WriteBatchWithIndex : public WriteBatchBase { Iterator* base_iterator, const ReadOptions* opts = nullptr); // default column family - Iterator* NewIteratorWithBase(Iterator* base_iterator); + Iterator* NewIteratorWithBase(Iterator* base_iterator, + const ReadOptions* opts = nullptr); // Similar to DB::Get() but will only read the key from this batch. // If the batch does not have enough data to resolve Merge operations, @@ -374,11 +377,10 @@ class WriteBatchWithIndex : public WriteBatchBase { uint32_t entry_count = 0; uint32_t overwritten_sd_count = 0; }; - // Will track CF ID, per CF entry count and overwritten sd count. - // Should be enabled when WBWI is empty for correct tracking. - void SetTrackPerCFStat(bool track); const std::unordered_map& GetCFStats() const; + // The total number of operations issued into this WBWI. + size_t GetWBWIOpCount() const; bool GetOverwriteKey() const; private: diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 104a6483dc5c..5fe307d19af8 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -11,16 +11,21 @@ // NOTE: in 'main' development branch, this should be the *next* // minor or major version number planned for release. -#define ROCKSDB_MAJOR 10 -#define ROCKSDB_MINOR 1 +#define ROCKSDB_MAJOR 11 +#define ROCKSDB_MINOR 0 #define ROCKSDB_PATCH 0 -// Do not use these. We made the mistake of declaring macros starting with -// double underscore. Now we have to live with our choice. We'll deprecate these -// at some point -#define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR -#define __ROCKSDB_MINOR__ ROCKSDB_MINOR -#define __ROCKSDB_PATCH__ ROCKSDB_PATCH +// Make it easy to do conditional compilation based on version checks, i.e. +// #if ROCKSDB_VERSION_GE(4, 5, 6) +// int thisCoderequiresVersion_4_5_6_OrGreater; +// #else +// int thisCodeIsForOlderVersions; +// #endif +#define ROCKSDB_MAKE_VERSION_INT(a, b, c) ((a) * 1000000 + (b) * 1000 + (c)) +#define ROCKSDB_VERSION_INT \ + ROCKSDB_MAKE_VERSION_INT(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH) +#define ROCKSDB_VERSION_GE(a, b, c) \ + (ROCKSDB_VERSION_INT >= ROCKSDB_MAKE_VERSION_INT(a, b, c)) namespace ROCKSDB_NAMESPACE { // Returns a set of properties indicating how/when/where this version of RocksDB diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt index a60847ead37d..5dd7be6cd1e4 100644 --- a/java/CMakeLists.txt +++ b/java/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.4) +cmake_minimum_required(VERSION 3.11) set(JAVA_JUNIT_VERSION "4.13.1") set(JAVA_HAMCR_VERSION "2.2") @@ -182,6 +182,7 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/HyperClockCache.java src/main/java/org/rocksdb/ImportColumnFamilyOptions.java src/main/java/org/rocksdb/IndexShorteningMode.java + src/main/java/org/rocksdb/IndexSearchType.java src/main/java/org/rocksdb/IndexType.java src/main/java/org/rocksdb/InfoLogLevel.java src/main/java/org/rocksdb/IngestExternalFileOptions.java diff --git a/java/rocksjni/compaction_options_fifo.cc b/java/rocksjni/compaction_options_fifo.cc index 535562fb47f7..f23eee6c3d2a 100644 --- a/java/rocksjni/compaction_options_fifo.cc +++ b/java/rocksjni/compaction_options_fifo.cc @@ -71,6 +71,54 @@ jboolean Java_org_rocksdb_CompactionOptionsFIFO_allowCompaction(JNIEnv*, jclass, return static_cast(opt->allow_compaction); } +/* + * Class: org_rocksdb_CompactionOptionsFIFO + * Method: setMaxDataFilesSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_CompactionOptionsFIFO_setMaxDataFilesSize( + JNIEnv*, jclass, jlong jhandle, jlong jmax_data_files_size) { + auto* opt = + reinterpret_cast(jhandle); + opt->max_data_files_size = static_cast(jmax_data_files_size); +} + +/* + * Class: org_rocksdb_CompactionOptionsFIFO + * Method: maxDataFilesSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_CompactionOptionsFIFO_maxDataFilesSize(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = + reinterpret_cast(jhandle); + return static_cast(opt->max_data_files_size); +} + +/* + * Class: org_rocksdb_CompactionOptionsFIFO + * Method: setUseKvRatioCompaction + * Signature: (JZ)V + */ +void Java_org_rocksdb_CompactionOptionsFIFO_setUseKvRatioCompaction( + JNIEnv*, jclass, jlong jhandle, jboolean use_kv_ratio_compaction) { + auto* opt = + reinterpret_cast(jhandle); + opt->use_kv_ratio_compaction = static_cast(use_kv_ratio_compaction); +} + +/* + * Class: org_rocksdb_CompactionOptionsFIFO + * Method: useKvRatioCompaction + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_CompactionOptionsFIFO_useKvRatioCompaction( + JNIEnv*, jclass, jlong jhandle) { + auto* opt = + reinterpret_cast(jhandle); + return static_cast(opt->use_kv_ratio_compaction); +} + /* * Class: org_rocksdb_CompactionOptionsFIFO * Method: disposeInternal diff --git a/java/rocksjni/config_options.cc b/java/rocksjni/config_options.cc index 1532dd9e80ad..2f243f978423 100644 --- a/java/rocksjni/config_options.cc +++ b/java/rocksjni/config_options.cc @@ -19,9 +19,9 @@ * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_ConfigOptions_disposeInternalJni(JNIEnv *, jclass, +void Java_org_rocksdb_ConfigOptions_disposeInternalJni(JNIEnv*, jclass, jlong jhandle) { - auto *co = reinterpret_cast(jhandle); + auto* co = reinterpret_cast(jhandle); assert(co != nullptr); delete co; } @@ -31,8 +31,8 @@ void Java_org_rocksdb_ConfigOptions_disposeInternalJni(JNIEnv *, jclass, * Method: newConfigOptions * Signature: ()J */ -jlong Java_org_rocksdb_ConfigOptions_newConfigOptions(JNIEnv *, jclass) { - auto *cfg_opt = new ROCKSDB_NAMESPACE::ConfigOptions(); +jlong Java_org_rocksdb_ConfigOptions_newConfigOptions(JNIEnv*, jclass) { + auto* cfg_opt = new ROCKSDB_NAMESPACE::ConfigOptions(); return GET_CPLUSPLUS_POINTER(cfg_opt); } @@ -41,11 +41,11 @@ jlong Java_org_rocksdb_ConfigOptions_newConfigOptions(JNIEnv *, jclass) { * Method: setEnv * Signature: (JJ;)V */ -void Java_org_rocksdb_ConfigOptions_setEnv(JNIEnv *, jclass, jlong handle, +void Java_org_rocksdb_ConfigOptions_setEnv(JNIEnv*, jclass, jlong handle, jlong rocksdb_env_handle) { - auto *cfg_opt = reinterpret_cast(handle); - auto *rocksdb_env = - reinterpret_cast(rocksdb_env_handle); + auto* cfg_opt = reinterpret_cast(handle); + auto* rocksdb_env = + reinterpret_cast(rocksdb_env_handle); cfg_opt->env = rocksdb_env; } @@ -54,10 +54,10 @@ void Java_org_rocksdb_ConfigOptions_setEnv(JNIEnv *, jclass, jlong handle, * Method: setDelimiter * Signature: (JLjava/lang/String;)V */ -void Java_org_rocksdb_ConfigOptions_setDelimiter(JNIEnv *env, jclass, +void Java_org_rocksdb_ConfigOptions_setDelimiter(JNIEnv* env, jclass, jlong handle, jstring s) { - auto *cfg_opt = reinterpret_cast(handle); - const char *delim = env->GetStringUTFChars(s, nullptr); + auto* cfg_opt = reinterpret_cast(handle); + const char* delim = env->GetStringUTFChars(s, nullptr); if (delim == nullptr) { // exception thrown: OutOfMemoryError return; @@ -71,10 +71,10 @@ void Java_org_rocksdb_ConfigOptions_setDelimiter(JNIEnv *env, jclass, * Method: setIgnoreUnknownOptions * Signature: (JZ)V */ -void Java_org_rocksdb_ConfigOptions_setIgnoreUnknownOptions(JNIEnv *, jclass, +void Java_org_rocksdb_ConfigOptions_setIgnoreUnknownOptions(JNIEnv*, jclass, jlong handle, jboolean b) { - auto *cfg_opt = reinterpret_cast(handle); + auto* cfg_opt = reinterpret_cast(handle); cfg_opt->ignore_unknown_options = static_cast(b); } @@ -83,10 +83,10 @@ void Java_org_rocksdb_ConfigOptions_setIgnoreUnknownOptions(JNIEnv *, jclass, * Method: setInputStringsEscaped * Signature: (JZ)V */ -void Java_org_rocksdb_ConfigOptions_setInputStringsEscaped(JNIEnv *, jclass, +void Java_org_rocksdb_ConfigOptions_setInputStringsEscaped(JNIEnv*, jclass, jlong handle, jboolean b) { - auto *cfg_opt = reinterpret_cast(handle); + auto* cfg_opt = reinterpret_cast(handle); cfg_opt->input_strings_escaped = static_cast(b); } @@ -95,9 +95,9 @@ void Java_org_rocksdb_ConfigOptions_setInputStringsEscaped(JNIEnv *, jclass, * Method: setSanityLevel * Signature: (JI)V */ -void Java_org_rocksdb_ConfigOptions_setSanityLevel(JNIEnv *, jclass, +void Java_org_rocksdb_ConfigOptions_setSanityLevel(JNIEnv*, jclass, jlong handle, jbyte level) { - auto *cfg_opt = reinterpret_cast(handle); + auto* cfg_opt = reinterpret_cast(handle); cfg_opt->sanity_level = ROCKSDB_NAMESPACE::SanityLevelJni::toCppSanityLevel(level); } diff --git a/java/rocksjni/env_options.cc b/java/rocksjni/env_options.cc index c3a9ae825da1..3f2577193e65 100644 --- a/java/rocksjni/env_options.cc +++ b/java/rocksjni/env_options.cc @@ -13,28 +13,28 @@ #include "rocksdb/env.h" #include "rocksjni/cplusplus_to_java_convert.h" -#define ENV_OPTIONS_SET_BOOL(_jhandle, _opt) \ - reinterpret_cast(_jhandle)->_opt = \ +#define ENV_OPTIONS_SET_BOOL(_jhandle, _opt) \ + reinterpret_cast(_jhandle)->_opt = \ static_cast(_opt) -#define ENV_OPTIONS_SET_SIZE_T(_jhandle, _opt) \ - reinterpret_cast(_jhandle)->_opt = \ +#define ENV_OPTIONS_SET_SIZE_T(_jhandle, _opt) \ + reinterpret_cast(_jhandle)->_opt = \ static_cast(_opt) -#define ENV_OPTIONS_SET_UINT64_T(_jhandle, _opt) \ - reinterpret_cast(_jhandle)->_opt = \ +#define ENV_OPTIONS_SET_UINT64_T(_jhandle, _opt) \ + reinterpret_cast(_jhandle)->_opt = \ static_cast(_opt) #define ENV_OPTIONS_GET(_jhandle, _opt) \ - reinterpret_cast(_jhandle)->_opt + reinterpret_cast(_jhandle)->_opt /* * Class: org_rocksdb_EnvOptions * Method: newEnvOptions * Signature: ()J */ -jlong Java_org_rocksdb_EnvOptions_newEnvOptions__(JNIEnv *, jclass) { - auto *env_opt = new ROCKSDB_NAMESPACE::EnvOptions(); +jlong Java_org_rocksdb_EnvOptions_newEnvOptions__(JNIEnv*, jclass) { + auto* env_opt = new ROCKSDB_NAMESPACE::EnvOptions(); return GET_CPLUSPLUS_POINTER(env_opt); } @@ -43,11 +43,11 @@ jlong Java_org_rocksdb_EnvOptions_newEnvOptions__(JNIEnv *, jclass) { * Method: newEnvOptions * Signature: (J)J */ -jlong Java_org_rocksdb_EnvOptions_newEnvOptions__J(JNIEnv *, jclass, +jlong Java_org_rocksdb_EnvOptions_newEnvOptions__J(JNIEnv*, jclass, jlong jdboptions_handle) { - auto *db_options = - reinterpret_cast(jdboptions_handle); - auto *env_opt = new ROCKSDB_NAMESPACE::EnvOptions(*db_options); + auto* db_options = + reinterpret_cast(jdboptions_handle); + auto* env_opt = new ROCKSDB_NAMESPACE::EnvOptions(*db_options); return GET_CPLUSPLUS_POINTER(env_opt); } @@ -56,9 +56,9 @@ jlong Java_org_rocksdb_EnvOptions_newEnvOptions__J(JNIEnv *, jclass, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_EnvOptions_disposeInternalJni(JNIEnv *, jclass, +void Java_org_rocksdb_EnvOptions_disposeInternalJni(JNIEnv*, jclass, jlong jhandle) { - auto *eo = reinterpret_cast(jhandle); + auto* eo = reinterpret_cast(jhandle); assert(eo != nullptr); delete eo; } @@ -68,8 +68,7 @@ void Java_org_rocksdb_EnvOptions_disposeInternalJni(JNIEnv *, jclass, * Method: setUseMmapReads * Signature: (JZ)V */ -void Java_org_rocksdb_EnvOptions_setUseMmapReads(JNIEnv *, jclass, - jlong jhandle, +void Java_org_rocksdb_EnvOptions_setUseMmapReads(JNIEnv*, jclass, jlong jhandle, jboolean use_mmap_reads) { ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_reads); } @@ -79,7 +78,7 @@ void Java_org_rocksdb_EnvOptions_setUseMmapReads(JNIEnv *, jclass, * Method: useMmapReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_useMmapReads(JNIEnv *, jclass, +jboolean Java_org_rocksdb_EnvOptions_useMmapReads(JNIEnv*, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, use_mmap_reads); } @@ -89,7 +88,7 @@ jboolean Java_org_rocksdb_EnvOptions_useMmapReads(JNIEnv *, jclass, * Method: setUseMmapWrites * Signature: (JZ)V */ -void Java_org_rocksdb_EnvOptions_setUseMmapWrites(JNIEnv *, jclass, +void Java_org_rocksdb_EnvOptions_setUseMmapWrites(JNIEnv*, jclass, jlong jhandle, jboolean use_mmap_writes) { ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_writes); @@ -100,7 +99,7 @@ void Java_org_rocksdb_EnvOptions_setUseMmapWrites(JNIEnv *, jclass, * Method: useMmapWrites * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv *, jclass, +jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv*, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, use_mmap_writes); } @@ -110,7 +109,7 @@ jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv *, jclass, * Method: setUseDirectReads * Signature: (JZ)V */ -void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *, jclass, +void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv*, jclass, jlong jhandle, jboolean use_direct_reads) { ENV_OPTIONS_SET_BOOL(jhandle, use_direct_reads); @@ -121,7 +120,7 @@ void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *, jclass, * Method: useDirectReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *, jclass, +jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv*, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, use_direct_reads); } @@ -132,7 +131,7 @@ jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *, jclass, * Signature: (JZ)V */ void Java_org_rocksdb_EnvOptions_setUseDirectWrites( - JNIEnv *, jclass, jlong jhandle, jboolean use_direct_writes) { + JNIEnv*, jclass, jlong jhandle, jboolean use_direct_writes) { ENV_OPTIONS_SET_BOOL(jhandle, use_direct_writes); } @@ -141,7 +140,7 @@ void Java_org_rocksdb_EnvOptions_setUseDirectWrites( * Method: useDirectWrites * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *, jclass, +jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv*, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, use_direct_writes); } @@ -151,7 +150,7 @@ jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *, jclass, * Method: setAllowFallocate * Signature: (JZ)V */ -void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv *, jclass, +void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv*, jclass, jlong jhandle, jboolean allow_fallocate) { ENV_OPTIONS_SET_BOOL(jhandle, allow_fallocate); @@ -162,7 +161,7 @@ void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv *, jclass, * Method: allowFallocate * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv *, jclass, +jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv*, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, allow_fallocate); } @@ -172,8 +171,7 @@ jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv *, jclass, * Method: setSetFdCloexec * Signature: (JZ)V */ -void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv *, jclass, - jlong jhandle, +void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv*, jclass, jlong jhandle, jboolean set_fd_cloexec) { ENV_OPTIONS_SET_BOOL(jhandle, set_fd_cloexec); } @@ -183,7 +181,7 @@ void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv *, jclass, * Method: setFdCloexec * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv *, jclass, +jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv*, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, set_fd_cloexec); } @@ -193,8 +191,7 @@ jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv *, jclass, * Method: setBytesPerSync * Signature: (JJ)V */ -void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv *, jclass, - jlong jhandle, +void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv*, jclass, jlong jhandle, jlong bytes_per_sync) { ENV_OPTIONS_SET_UINT64_T(jhandle, bytes_per_sync); } @@ -204,8 +201,7 @@ void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv *, jclass, * Method: bytesPerSync * Signature: (J)J */ -jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv *, jclass, - jlong jhandle) { +jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv*, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, bytes_per_sync); } @@ -215,7 +211,7 @@ jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv *, jclass, * Signature: (JZ)V */ void Java_org_rocksdb_EnvOptions_setFallocateWithKeepSize( - JNIEnv *, jclass, jlong jhandle, jboolean fallocate_with_keep_size) { + JNIEnv*, jclass, jlong jhandle, jboolean fallocate_with_keep_size) { ENV_OPTIONS_SET_BOOL(jhandle, fallocate_with_keep_size); } @@ -224,7 +220,7 @@ void Java_org_rocksdb_EnvOptions_setFallocateWithKeepSize( * Method: fallocateWithKeepSize * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv *, jclass, +jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv*, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, fallocate_with_keep_size); } @@ -235,7 +231,7 @@ jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv *, jclass, * Signature: (JJ)V */ void Java_org_rocksdb_EnvOptions_setCompactionReadaheadSize( - JNIEnv *, jclass, jlong jhandle, jlong compaction_readahead_size) { + JNIEnv*, jclass, jlong jhandle, jlong compaction_readahead_size) { ENV_OPTIONS_SET_SIZE_T(jhandle, compaction_readahead_size); } @@ -244,7 +240,7 @@ void Java_org_rocksdb_EnvOptions_setCompactionReadaheadSize( * Method: compactionReadaheadSize * Signature: (J)J */ -jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv *, jclass, +jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv*, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, compaction_readahead_size); } @@ -255,7 +251,7 @@ jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv *, jclass, * Signature: (JJ)V */ void Java_org_rocksdb_EnvOptions_setWritableFileMaxBufferSize( - JNIEnv *, jclass, jlong jhandle, jlong writable_file_max_buffer_size) { + JNIEnv*, jclass, jlong jhandle, jlong writable_file_max_buffer_size) { ENV_OPTIONS_SET_SIZE_T(jhandle, writable_file_max_buffer_size); } @@ -264,7 +260,7 @@ void Java_org_rocksdb_EnvOptions_setWritableFileMaxBufferSize( * Method: writableFileMaxBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv *, jclass, +jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv*, jclass, jlong jhandle) { return ENV_OPTIONS_GET(jhandle, writable_file_max_buffer_size); } @@ -274,11 +270,11 @@ jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv *, jclass, * Method: setRateLimiter * Signature: (JJ)V */ -void Java_org_rocksdb_EnvOptions_setRateLimiter(JNIEnv *, jclass, jlong jhandle, +void Java_org_rocksdb_EnvOptions_setRateLimiter(JNIEnv*, jclass, jlong jhandle, jlong rl_handle) { - auto *sptr_rate_limiter = - reinterpret_cast *>( + auto* sptr_rate_limiter = + reinterpret_cast*>( rl_handle); - auto *env_opt = reinterpret_cast(jhandle); + auto* env_opt = reinterpret_cast(jhandle); env_opt->rate_limiter = sptr_rate_limiter->get(); } diff --git a/java/rocksjni/import_column_family_options.cc b/java/rocksjni/import_column_family_options.cc index 1a9bded516b1..cd7bdfe007fa 100644 --- a/java/rocksjni/import_column_family_options.cc +++ b/java/rocksjni/import_column_family_options.cc @@ -16,8 +16,8 @@ * Signature: ()J */ jlong Java_org_rocksdb_ImportColumnFamilyOptions_newImportColumnFamilyOptions( - JNIEnv *, jclass) { - ROCKSDB_NAMESPACE::ImportColumnFamilyOptions *opts = + JNIEnv*, jclass) { + ROCKSDB_NAMESPACE::ImportColumnFamilyOptions* opts = new ROCKSDB_NAMESPACE::ImportColumnFamilyOptions(); return GET_CPLUSPLUS_POINTER(opts); } @@ -28,9 +28,9 @@ jlong Java_org_rocksdb_ImportColumnFamilyOptions_newImportColumnFamilyOptions( * Signature: (JZ)V */ void Java_org_rocksdb_ImportColumnFamilyOptions_setMoveFiles( - JNIEnv *, jobject, jlong jhandle, jboolean jmove_files) { - auto *options = - reinterpret_cast(jhandle); + JNIEnv*, jobject, jlong jhandle, jboolean jmove_files) { + auto* options = + reinterpret_cast(jhandle); options->move_files = static_cast(jmove_files); } @@ -39,10 +39,10 @@ void Java_org_rocksdb_ImportColumnFamilyOptions_setMoveFiles( * Method: moveFiles * Signature: (J)Z */ -jboolean Java_org_rocksdb_ImportColumnFamilyOptions_moveFiles(JNIEnv *, jobject, +jboolean Java_org_rocksdb_ImportColumnFamilyOptions_moveFiles(JNIEnv*, jobject, jlong jhandle) { - auto *options = - reinterpret_cast(jhandle); + auto* options = + reinterpret_cast(jhandle); return static_cast(options->move_files); } @@ -51,9 +51,9 @@ jboolean Java_org_rocksdb_ImportColumnFamilyOptions_moveFiles(JNIEnv *, jobject, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_ImportColumnFamilyOptions_disposeInternal(JNIEnv *, +void Java_org_rocksdb_ImportColumnFamilyOptions_disposeInternal(JNIEnv*, jobject, jlong jhandle) { - delete reinterpret_cast( + delete reinterpret_cast( jhandle); } \ No newline at end of file diff --git a/java/rocksjni/kv_helper.h b/java/rocksjni/kv_helper.h index 5f0a8ffc57eb..75f254b173cd 100644 --- a/java/rocksjni/kv_helper.h +++ b/java/rocksjni/kv_helper.h @@ -81,7 +81,7 @@ class KVException : public std::exception { } } - KVException(jint code) : kCode_(code){}; + KVException(jint code) : kCode_(code) {}; virtual const char* what() const noexcept { return "Exception raised by JNI. There may be a Java exception in the " @@ -176,13 +176,13 @@ class JByteArrayPinnableSlice { : env_(env), jbuffer_(jbuffer), jbuffer_off_(jbuffer_off), - jbuffer_len_(jbuffer_len){}; + jbuffer_len_(jbuffer_len) {}; /** * @brief Construct an empty new JByteArrayPinnableSlice object * */ - JByteArrayPinnableSlice(JNIEnv* env) : env_(env){}; + JByteArrayPinnableSlice(JNIEnv* env) : env_(env) {}; PinnableSlice& pinnable_slice() { return pinnable_slice_; } diff --git a/java/rocksjni/memory_util.cc b/java/rocksjni/memory_util.cc index c87c4f403bbb..d60a89296481 100644 --- a/java/rocksjni/memory_util.cc +++ b/java/rocksjni/memory_util.cc @@ -21,9 +21,9 @@ * Signature: ([J[J)Ljava/util/Map; */ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType( - JNIEnv *env, jclass, jlongArray jdb_handles, jlongArray jcache_handles) { + JNIEnv* env, jclass, jlongArray jdb_handles, jlongArray jcache_handles) { jboolean has_exception = JNI_FALSE; - std::vector dbs = + std::vector dbs = ROCKSDB_NAMESPACE::JniUtil::fromJPointers( env, jdb_handles, &has_exception); if (has_exception == JNI_TRUE) { @@ -31,18 +31,18 @@ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType( return nullptr; } - std::unordered_set cache_set; + std::unordered_set cache_set; jsize cache_handle_count = env->GetArrayLength(jcache_handles); if (cache_handle_count > 0) { - jlong *ptr_jcache_handles = + jlong* ptr_jcache_handles = env->GetLongArrayElements(jcache_handles, nullptr); if (ptr_jcache_handles == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } for (jsize i = 0; i < cache_handle_count; i++) { - auto *cache_ptr = - reinterpret_cast *>( + auto* cache_ptr = + reinterpret_cast*>( ptr_jcache_handles[i]); cache_set.insert(cache_ptr->get()); } @@ -68,7 +68,7 @@ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType( jobject> fn_map_kv = [env]( const std::pair &pair) { + uint64_t>& pair) { // Construct key const jobject jusage_type = ROCKSDB_NAMESPACE::ByteJni::valueOf( env, ROCKSDB_NAMESPACE::MemoryUsageTypeJni::toJavaMemoryUsageType( diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index c986511a3f2f..3166e6625090 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -1959,30 +1959,6 @@ jboolean Java_org_rocksdb_Options_skipStatsUpdateOnDbOpen(JNIEnv*, jclass, return static_cast(opt->skip_stats_update_on_db_open); } -/* - * Class: org_rocksdb_Options - * Method: setSkipCheckingSstFileSizesOnDbOpen - * Signature: (JZ)V - */ -void Java_org_rocksdb_Options_setSkipCheckingSstFileSizesOnDbOpen( - JNIEnv*, jclass, jlong jhandle, - jboolean jskip_checking_sst_file_sizes_on_db_open) { - auto* opt = reinterpret_cast(jhandle); - opt->skip_checking_sst_file_sizes_on_db_open = - static_cast(jskip_checking_sst_file_sizes_on_db_open); -} - -/* - * Class: org_rocksdb_Options - * Method: skipCheckingSstFileSizesOnDbOpen - * Signature: (J)Z - */ -jboolean Java_org_rocksdb_Options_skipCheckingSstFileSizesOnDbOpen( - JNIEnv*, jclass, jlong jhandle) { - auto* opt = reinterpret_cast(jhandle); - return static_cast(opt->skip_checking_sst_file_sizes_on_db_open); -} - /* * Class: org_rocksdb_Options * Method: setWalRecoveryMode @@ -2055,29 +2031,6 @@ void Java_org_rocksdb_Options_setWalFilter(JNIEnv*, jclass, jlong jhandle, opt->wal_filter = wal_filter; } -/* - * Class: org_rocksdb_Options - * Method: setFailIfOptionsFileError - * Signature: (JZ)V - */ -void Java_org_rocksdb_Options_setFailIfOptionsFileError( - JNIEnv*, jclass, jlong jhandle, jboolean jfail_if_options_file_error) { - auto* opt = reinterpret_cast(jhandle); - opt->fail_if_options_file_error = - static_cast(jfail_if_options_file_error); -} - -/* - * Class: org_rocksdb_Options - * Method: failIfOptionsFileError - * Signature: (J)Z - */ -jboolean Java_org_rocksdb_Options_failIfOptionsFileError(JNIEnv*, jclass, - jlong jhandle) { - auto* opt = reinterpret_cast(jhandle); - return static_cast(opt->fail_if_options_file_error); -} - /* * Class: org_rocksdb_Options * Method: setDumpMallocStats @@ -2456,28 +2409,6 @@ void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge( ->min_write_buffer_number_to_merge = static_cast(jmin_write_buffer_number_to_merge); } -/* - * Class: org_rocksdb_Options - * Method: maxWriteBufferNumberToMaintain - * Signature: (J)I - */ -jint Java_org_rocksdb_Options_maxWriteBufferNumberToMaintain(JNIEnv*, jclass, - jlong jhandle) { - return reinterpret_cast(jhandle) - ->max_write_buffer_number_to_maintain; -} - -/* - * Class: org_rocksdb_Options - * Method: setMaxWriteBufferNumberToMaintain - * Signature: (JI)V - */ -void Java_org_rocksdb_Options_setMaxWriteBufferNumberToMaintain( - JNIEnv*, jclass, jlong jhandle, jint jmax_write_buffer_number_to_maintain) { - reinterpret_cast(jhandle) - ->max_write_buffer_number_to_maintain = - static_cast(jmax_write_buffer_number_to_maintain); -} /* * Class: org_rocksdb_Options @@ -4496,29 +4427,6 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMinWriteBufferNumberToMerge( static_cast(jmin_write_buffer_number_to_merge); } -/* - * Class: org_rocksdb_ColumnFamilyOptions - * Method: maxWriteBufferNumberToMaintain - * Signature: (J)I - */ -jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumberToMaintain( - JNIEnv*, jclass, jlong jhandle) { - return reinterpret_cast(jhandle) - ->max_write_buffer_number_to_maintain; -} - -/* - * Class: org_rocksdb_ColumnFamilyOptions - * Method: setMaxWriteBufferNumberToMaintain - * Signature: (JI)V - */ -void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumberToMaintain( - JNIEnv*, jclass, jlong jhandle, jint jmax_write_buffer_number_to_maintain) { - reinterpret_cast(jhandle) - ->max_write_buffer_number_to_maintain = - static_cast(jmax_write_buffer_number_to_maintain); -} - /* * Class: org_rocksdb_ColumnFamilyOptions * Method: setCompressionType @@ -7427,30 +7335,6 @@ jboolean Java_org_rocksdb_DBOptions_skipStatsUpdateOnDbOpen(JNIEnv*, jclass, return static_cast(opt->skip_stats_update_on_db_open); } -/* - * Class: org_rocksdb_DBOptions - * Method: setSkipCheckingSstFileSizesOnDbOpen - * Signature: (JZ)V - */ -void Java_org_rocksdb_DBOptions_setSkipCheckingSstFileSizesOnDbOpen( - JNIEnv*, jclass, jlong jhandle, - jboolean jskip_checking_sst_file_sizes_on_db_open) { - auto* opt = reinterpret_cast(jhandle); - opt->skip_checking_sst_file_sizes_on_db_open = - static_cast(jskip_checking_sst_file_sizes_on_db_open); -} - -/* - * Class: org_rocksdb_DBOptions - * Method: skipCheckingSstFileSizesOnDbOpen - * Signature: (J)Z - */ -jboolean Java_org_rocksdb_DBOptions_skipCheckingSstFileSizesOnDbOpen( - JNIEnv*, jclass, jlong jhandle) { - auto* opt = reinterpret_cast(jhandle); - return static_cast(opt->skip_checking_sst_file_sizes_on_db_open); -} - /* * Class: org_rocksdb_DBOptions * Method: setWalRecoveryMode @@ -7524,29 +7408,6 @@ void Java_org_rocksdb_DBOptions_setWalFilter(JNIEnv*, jclass, jlong jhandle, opt->wal_filter = wal_filter; } -/* - * Class: org_rocksdb_DBOptions - * Method: setFailIfOptionsFileError - * Signature: (JZ)V - */ -void Java_org_rocksdb_DBOptions_setFailIfOptionsFileError( - JNIEnv*, jclass, jlong jhandle, jboolean jfail_if_options_file_error) { - auto* opt = reinterpret_cast(jhandle); - opt->fail_if_options_file_error = - static_cast(jfail_if_options_file_error); -} - -/* - * Class: org_rocksdb_DBOptions - * Method: failIfOptionsFileError - * Signature: (J)Z - */ -jboolean Java_org_rocksdb_DBOptions_failIfOptionsFileError(JNIEnv*, jclass, - jlong jhandle) { - auto* opt = reinterpret_cast(jhandle); - return static_cast(opt->fail_if_options_file_error); -} - /* * Class: org_rocksdb_DBOptions * Method: setDumpMallocStats @@ -8170,26 +8031,6 @@ jboolean Java_org_rocksdb_ReadOptions_tailing(JNIEnv*, jclass, jlong jhandle) { return reinterpret_cast(jhandle)->tailing; } -/* - * Class: org_rocksdb_ReadOptions - * Method: managed - * Signature: (J)Z - */ -jboolean Java_org_rocksdb_ReadOptions_managed(JNIEnv*, jclass, jlong jhandle) { - return reinterpret_cast(jhandle)->managed; -} - -/* - * Class: org_rocksdb_ReadOptions - * Method: setManaged - * Signature: (JZ)V - */ -void Java_org_rocksdb_ReadOptions_setManaged(JNIEnv*, jclass, jlong jhandle, - jboolean jmanaged) { - reinterpret_cast(jhandle)->managed = - static_cast(jmanaged); -} - /* * Class: org_rocksdb_ReadOptions * Method: totalOrderSeek diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index d0f288ca8281..9600a736573a 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -5101,9 +5101,9 @@ class TickerTypeJni { return -0x1; case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_READ: return -0x2; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_DEPRECATED: return -0x3; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL: + case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL_DEPRECATED: return -0x4; case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB: return -0x5; @@ -5195,6 +5195,8 @@ class TickerTypeJni { return -0x2F; case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES: return -0x30; + case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_RESUMED_BYTES: + return -0x5F; case ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES: return -0x31; case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES: @@ -5273,6 +5275,38 @@ class TickerTypeJni { return -0x56; case ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT: return -0x57; + case ROCKSDB_NAMESPACE::Tickers::FIFO_CHANGE_TEMPERATURE_COMPACTIONS: + return -0x58; + case ROCKSDB_NAMESPACE::Tickers::ICE_FILE_READ_BYTES: + return -0x59; + case ROCKSDB_NAMESPACE::Tickers::ICE_FILE_READ_COUNT: + return -0x5A; + case ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_BYTES: + return -0x5B; + case ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_COUNT: + return -0x5C; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_WBWI_INGEST: + return -0x5D; + case ROCKSDB_NAMESPACE::Tickers::SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT: + return -0x5E; + case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREPARE_CALLS: + return -0x60; + case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREPARE_ERRORS: + return -0x61; + case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_BLOCKS_PREFETCHED: + return -0x62; + case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_BLOCKS_FROM_CACHE: + return -0x63; + case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREFETCH_BYTES: + return -0x64; + case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREFETCH_BLOCKS_WASTED: + return -0x65; + case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_IO_REQUESTS: + return -0x66; + case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_IO_COALESCED_NONADJACENT: + return -0x67; + case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_SEEK_ERRORS: + return -0x68; case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX: // -0x54 is the max value at this time. Since these values are exposed // directly to Java clients, we'll keep the value the same till the next @@ -5560,9 +5594,9 @@ class TickerTypeJni { case -0x2: return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_READ; case -0x3: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_DEPRECATED; case -0x4: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL; + return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL_DEPRECATED; case -0x5: return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB; case -0x6: @@ -5654,6 +5688,8 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_READ_BYTES; case -0x30: return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES; + case -0x5F: + return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_RESUMED_BYTES; case -0x31: return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES; case -0x32: @@ -5735,6 +5771,39 @@ class TickerTypeJni { case -0x57: return ROCKSDB_NAMESPACE::Tickers:: FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT; + case -0x58: + return ROCKSDB_NAMESPACE::Tickers::FIFO_CHANGE_TEMPERATURE_COMPACTIONS; + case -0x59: + return ROCKSDB_NAMESPACE::Tickers::ICE_FILE_READ_BYTES; + case -0x5A: + return ROCKSDB_NAMESPACE::Tickers::ICE_FILE_READ_COUNT; + case -0x5B: + return ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_BYTES; + case -0x5C: + return ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_COUNT; + case -0x5D: + return ROCKSDB_NAMESPACE::Tickers::NUMBER_WBWI_INGEST; + case -0x5E: + return ROCKSDB_NAMESPACE::Tickers:: + SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT; + case -0x60: + return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREPARE_CALLS; + case -0x61: + return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREPARE_ERRORS; + case -0x62: + return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_BLOCKS_PREFETCHED; + case -0x63: + return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_BLOCKS_FROM_CACHE; + case -0x64: + return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREFETCH_BYTES; + case -0x65: + return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREFETCH_BLOCKS_WASTED; + case -0x66: + return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_IO_REQUESTS; + case -0x67: + return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_IO_COALESCED_NONADJACENT; + case -0x68: + return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_SEEK_ERRORS; case -0x54: // -0x54 is the max value at this time. Since these values are exposed // directly to Java clients, we'll keep the value the same till the next @@ -5889,8 +5958,15 @@ class HistogramTypeJni { return 0x3C; case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_PREFETCH_TAIL_READ_BYTES: return 0x3D; + case ROCKSDB_NAMESPACE::Histograms::COMPACTION_PREFETCH_BYTES: + return 0x3F; + case ROCKSDB_NAMESPACE::Histograms::MULTISCAN_PREPARE_MICROS: + return 0x40; + case ROCKSDB_NAMESPACE::Histograms::MULTISCAN_BLOCKS_PER_PREPARE: + return 0x41; case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX: - // 0x3D for backwards compatibility on current minor version. + // 0x3E is reserved for backwards compatibility on current minor + // version. return 0x3E; default: // undefined/default @@ -6033,8 +6109,15 @@ class HistogramTypeJni { case 0x3D: return ROCKSDB_NAMESPACE::Histograms:: TABLE_OPEN_PREFETCH_TAIL_READ_BYTES; + case 0x3F: + return ROCKSDB_NAMESPACE::Histograms::COMPACTION_PREFETCH_BYTES; + case 0x40: + return ROCKSDB_NAMESPACE::Histograms::MULTISCAN_PREPARE_MICROS; + case 0x41: + return ROCKSDB_NAMESPACE::Histograms::MULTISCAN_BLOCKS_PER_PREPARE; case 0x3E: - // 0x1F for backwards compatibility on current minor version. + // 0x3E is reserved for backwards compatibility on current minor + // version. return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; default: @@ -6933,6 +7016,44 @@ class DataBlockIndexTypeJni { } }; +// The portal class for org.rocksdb.IndexSearchType +class IndexSearchTypeJni { + public: + // Returns the equivalent org.rocksdb.IndexSearchType for the provided + // C++ ROCKSDB_NAMESPACE::BlockSearchType enum + static jbyte toJavaIndexSearchType( + const ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType& + index_block_search_type) { + switch (index_block_search_type) { + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType::kBinary: + return 0x0; + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType:: + kInterpolation: + return 0x1; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::BlockSearchType enum for + // the provided Java org.rocksdb.IndexSearchType + static ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType + toCppIndexSearchType(jbyte jindex_search_type) { + switch (jindex_search_type) { + case 0x0: + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType:: + kBinary; + case 0x1: + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType:: + kInterpolation; + default: + // undefined/default + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType:: + kBinary; + } + } +}; + // The portal class for org.rocksdb.ChecksumType class ChecksumTypeJni { public: @@ -9117,7 +9238,7 @@ class BlockBasedTableOptionsJni } jmethodID method_id_init = - env->GetMethodID(jclazz, "", "(ZZZZBBDBZJIIIJZZZZZIIZZBBJD)V"); + env->GetMethodID(jclazz, "", "(ZZZZBBDBZJIIIJZZZZZIIZZJJBBBJD)V"); if (method_id_init == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; @@ -9162,8 +9283,13 @@ class BlockBasedTableOptionsJni table_factory_options->format_version, table_factory_options->enable_index_compression, table_factory_options->block_align, + static_cast(table_factory_options->super_block_alignment_size), + static_cast( + table_factory_options->super_block_alignment_space_overhead_ratio), IndexShorteningModeJni::toJavaIndexShorteningMode( table_factory_options->index_shortening), + IndexSearchTypeJni::toJavaIndexSearchType( + table_factory_options->index_block_search_type), FilterPolicyJni::toJavaIndexType(filter_policy_type), filter_policy_handle, filter_policy_config_value); if (env->ExceptionCheck()) { diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 9561b3893661..4a33d4e2f5e4 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -34,11 +34,12 @@ #undef min #endif -jlong rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, jstring jdb_path, - std::function - open_fn) { +jlong rocksdb_open_helper( + JNIEnv* env, jlong jopt_handle, jstring jdb_path, + std::function*)> + open_fn) { const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); if (db_path == nullptr) { // exception thrown: OutOfMemoryError @@ -46,13 +47,13 @@ jlong rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, jstring jdb_path, } auto* opt = reinterpret_cast(jopt_handle); - ROCKSDB_NAMESPACE::DB* db = nullptr; + std::unique_ptr db; ROCKSDB_NAMESPACE::Status s = open_fn(*opt, db_path, &db); env->ReleaseStringUTFChars(jdb_path, db_path); if (s.ok()) { - return GET_CPLUSPLUS_POINTER(db); + return GET_CPLUSPLUS_POINTER(db.release()); } else { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); return 0; @@ -67,11 +68,12 @@ jlong rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, jstring jdb_path, jlong Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2(JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path) { - return rocksdb_open_helper(env, jopt_handle, jdb_path, - (ROCKSDB_NAMESPACE::Status(*)( - const ROCKSDB_NAMESPACE::Options&, - const std::string&, ROCKSDB_NAMESPACE::DB**)) & - ROCKSDB_NAMESPACE::DB::Open); + return rocksdb_open_helper( + env, jopt_handle, jdb_path, + [](const ROCKSDB_NAMESPACE::Options& options, const std::string& db_path, + std::unique_ptr* db) { + return ROCKSDB_NAMESPACE::DB::Open(options, db_path, db); + }); } /* @@ -87,7 +89,7 @@ jlong Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2Z( env, jopt_handle, jdb_path, [error_if_wal_file_exists](const ROCKSDB_NAMESPACE::Options& options, const std::string& db_path, - ROCKSDB_NAMESPACE::DB** db) { + std::unique_ptr* db) { return ROCKSDB_NAMESPACE::DB::OpenForReadOnly(options, db_path, db, error_if_wal_file_exists); }); @@ -100,7 +102,7 @@ jlongArray rocksdb_open_helper( const ROCKSDB_NAMESPACE::DBOptions&, const std::string&, const std::vector&, std::vector*, - ROCKSDB_NAMESPACE::DB**)> + std::unique_ptr*)> open_fn) { const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); if (db_path == nullptr) { @@ -141,7 +143,7 @@ jlongArray rocksdb_open_helper( auto* opt = reinterpret_cast(jopt_handle); std::vector cf_handles; - ROCKSDB_NAMESPACE::DB* db = nullptr; + std::unique_ptr db; ROCKSDB_NAMESPACE::Status s = open_fn(*opt, db_path, column_families, &cf_handles, &db); @@ -157,7 +159,7 @@ jlongArray rocksdb_open_helper( const jsize resultsLen = 1 + len_cols; // db handle + column family handles std::unique_ptr results = std::unique_ptr(new jlong[resultsLen]); - results[0] = GET_CPLUSPLUS_POINTER(db); + results[0] = GET_CPLUSPLUS_POINTER(db.release()); for (int i = 1; i <= len_cols; i++) { results[i] = GET_CPLUSPLUS_POINTER(cf_handles[i - 1]); } @@ -196,7 +198,7 @@ jlongArray Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2_3_3B_3JZ( const std::vector& column_families, std::vector* handles, - ROCKSDB_NAMESPACE::DB** db) { + std::unique_ptr* db) { return ROCKSDB_NAMESPACE::DB::OpenForReadOnly( options, db_path, column_families, handles, db, error_if_wal_file_exists); @@ -213,12 +215,15 @@ jlongArray Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2_3_3B_3J( jobjectArray jcolumn_names, jlongArray jcolumn_options) { return rocksdb_open_helper( env, jopt_handle, jdb_path, jcolumn_names, jcolumn_options, - (ROCKSDB_NAMESPACE::Status(*)( - const ROCKSDB_NAMESPACE::DBOptions&, const std::string&, - const std::vector&, - std::vector*, - ROCKSDB_NAMESPACE::DB**)) & - ROCKSDB_NAMESPACE::DB::Open); + [](const ROCKSDB_NAMESPACE::DBOptions& options, + const std::string& db_path, + const std::vector& + column_families, + std::vector* handles, + std::unique_ptr* db) { + return ROCKSDB_NAMESPACE::DB::Open(options, db_path, column_families, + handles, db); + }); } /* @@ -240,7 +245,7 @@ jlong Java_org_rocksdb_RocksDB_openAsSecondary__JLjava_lang_String_2Ljava_lang_S env, jopt_handle, jdb_path, [secondary_db_path](const ROCKSDB_NAMESPACE::Options& options, const std::string& db_path, - ROCKSDB_NAMESPACE::DB** db) { + std::unique_ptr* db) { return ROCKSDB_NAMESPACE::DB::OpenAsSecondary(options, db_path, secondary_db_path, db); }); @@ -276,7 +281,7 @@ Java_org_rocksdb_RocksDB_openAsSecondary__JLjava_lang_String_2Ljava_lang_String_ const std::vector& column_families, std::vector* handles, - ROCKSDB_NAMESPACE::DB** db) { + std::unique_ptr* db) { return ROCKSDB_NAMESPACE::DB::OpenAsSecondary( options, db_path, secondary_db_path, column_families, handles, db); }); @@ -1210,6 +1215,9 @@ jint Java_org_rocksdb_RocksDB_getDirect(JNIEnv* env, jclass /*jdb*/, db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice()); } + if (s.IsNotFound()) { + return ROCKSDB_NAMESPACE::KVException::kNotFound; + } ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s); return value.Fetch(); } catch (ROCKSDB_NAMESPACE::KVException& e) { @@ -1453,10 +1461,13 @@ jbyteArray Java_org_rocksdb_RocksDB_get__J_3BII(JNIEnv* env, jclass, try { ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env); - ROCKSDB_NAMESPACE::KVException::ThrowOnError( - env, + ROCKSDB_NAMESPACE::Status s = db->Get(ROCKSDB_NAMESPACE::ReadOptions(), db->DefaultColumnFamily(), - key.slice(), &value.pinnable_slice())); + key.slice(), &value.pinnable_slice()); + if (s.IsNotFound()) { + return nullptr; + } + ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s); return value.NewByteArray(); } catch (ROCKSDB_NAMESPACE::KVException&) { @@ -1484,9 +1495,13 @@ jbyteArray Java_org_rocksdb_RocksDB_get__J_3BIIJ(JNIEnv* env, jclass, try { ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env); - ROCKSDB_NAMESPACE::KVException::ThrowOnError( - env, db->Get(ROCKSDB_NAMESPACE::ReadOptions(), cf_handle, key.slice(), - &value.pinnable_slice())); + ROCKSDB_NAMESPACE::Status s = + db->Get(ROCKSDB_NAMESPACE::ReadOptions(), cf_handle, key.slice(), + &value.pinnable_slice()); + if (s.IsNotFound()) { + return nullptr; + } + ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s); return value.NewByteArray(); } catch (ROCKSDB_NAMESPACE::KVException&) { @@ -1509,11 +1524,13 @@ jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BII(JNIEnv* env, jclass, try { ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env); - ROCKSDB_NAMESPACE::KVException::ThrowOnError( - env, - db->Get( - *reinterpret_cast(jropt_handle), - db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice())); + ROCKSDB_NAMESPACE::Status s = db->Get( + *reinterpret_cast(jropt_handle), + db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice()); + if (s.IsNotFound()) { + return nullptr; + } + ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s); return value.NewByteArray(); } catch (ROCKSDB_NAMESPACE::KVException&) { return nullptr; @@ -1538,10 +1555,13 @@ jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BIIJ( try { ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env); - ROCKSDB_NAMESPACE::KVException::ThrowOnError( - env, db->Get(*reinterpret_cast( - jropt_handle), - cf_handle, key.slice(), &value.pinnable_slice())); + ROCKSDB_NAMESPACE::Status s = db->Get( + *reinterpret_cast(jropt_handle), + cf_handle, key.slice(), &value.pinnable_slice()); + if (s.IsNotFound()) { + return nullptr; + } + ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s); return value.NewByteArray(); } catch (ROCKSDB_NAMESPACE::KVException&) { return nullptr; @@ -1563,10 +1583,13 @@ jint Java_org_rocksdb_RocksDB_get__J_3BII_3BII(JNIEnv* env, jclass, ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off, jval_len); - ROCKSDB_NAMESPACE::KVException::ThrowOnError( - env, + ROCKSDB_NAMESPACE::Status s = db->Get(ROCKSDB_NAMESPACE::ReadOptions(), db->DefaultColumnFamily(), - key.slice(), &value.pinnable_slice())); + key.slice(), &value.pinnable_slice()); + if (s.IsNotFound()) { + return ROCKSDB_NAMESPACE::KVException::kNotFound; + } + ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s); return value.Fetch(); } catch (ROCKSDB_NAMESPACE::KVException& e) { @@ -1595,9 +1618,13 @@ jint Java_org_rocksdb_RocksDB_get__J_3BII_3BIIJ(JNIEnv* env, jclass, ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off, jval_len); - ROCKSDB_NAMESPACE::KVException::ThrowOnError( - env, db->Get(ROCKSDB_NAMESPACE::ReadOptions(), cf_handle, key.slice(), - &value.pinnable_slice())); + ROCKSDB_NAMESPACE::Status s = + db->Get(ROCKSDB_NAMESPACE::ReadOptions(), cf_handle, key.slice(), + &value.pinnable_slice()); + if (s.IsNotFound()) { + return ROCKSDB_NAMESPACE::KVException::kNotFound; + } + ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s); return value.Fetch(); } catch (ROCKSDB_NAMESPACE::KVException& e) { @@ -1621,11 +1648,13 @@ jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BII(JNIEnv* env, jclass, ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off, jval_len); - ROCKSDB_NAMESPACE::KVException::ThrowOnError( - env, - db->Get( - *reinterpret_cast(jropt_handle), - db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice())); + ROCKSDB_NAMESPACE::Status s = db->Get( + *reinterpret_cast(jropt_handle), + db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice()); + if (s.IsNotFound()) { + return ROCKSDB_NAMESPACE::KVException::kNotFound; + } + ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s); return value.Fetch(); } catch (ROCKSDB_NAMESPACE::KVException& e) { @@ -1652,10 +1681,13 @@ jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BIIJ( ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len); ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off, jval_len); - ROCKSDB_NAMESPACE::KVException::ThrowOnError( - env, db->Get(*reinterpret_cast( - jropt_handle), - cf_handle, key.slice(), &value.pinnable_slice())); + ROCKSDB_NAMESPACE::Status s = db->Get( + *reinterpret_cast(jropt_handle), + cf_handle, key.slice(), &value.pinnable_slice()); + if (s.IsNotFound()) { + return ROCKSDB_NAMESPACE::KVException::kNotFound; + } + ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s); return value.Fetch(); } catch (ROCKSDB_NAMESPACE::KVException& e) { @@ -2951,6 +2983,28 @@ void Java_org_rocksdb_RocksDB_continueBackgroundWork(JNIEnv* env, jclass, } } +/* + * Class: org_rocksdb_RocksDB + * Method: abortAllCompactions + * Signature: (J)V + */ +void Java_org_rocksdb_RocksDB_abortAllCompactions(JNIEnv*, jclass, + jlong jdb_handle) { + auto* db = reinterpret_cast(jdb_handle); + db->AbortAllCompactions(); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: resumeAllCompactions + * Signature: (J)V + */ +void Java_org_rocksdb_RocksDB_resumeAllCompactions(JNIEnv*, jclass, + jlong jdb_handle) { + auto* db = reinterpret_cast(jdb_handle); + db->ResumeAllCompactions(); +} + /* * Class: org_rocksdb_RocksDB * Method: enableAutoCompaction @@ -2996,17 +3050,9 @@ jint Java_org_rocksdb_RocksDB_numberLevels(JNIEnv*, jclass, jlong jdb_handle, * Signature: (JJ)I */ jint Java_org_rocksdb_RocksDB_maxMemCompactionLevel(JNIEnv*, jclass, - jlong jdb_handle, - jlong jcf_handle) { - auto* db = reinterpret_cast(jdb_handle); - ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; - if (jcf_handle == 0) { - cf_handle = db->DefaultColumnFamily(); - } else { - cf_handle = - reinterpret_cast(jcf_handle); - } - return static_cast(db->MaxMemCompactionLevel(cf_handle)); + jlong /*jdb_handle*/, + jlong /*jcf_handle*/) { + return 0; } /* @@ -3637,7 +3683,7 @@ void Java_org_rocksdb_RocksDB_destroyDB(JNIEnv* env, jclass, jstring jdb_path, } bool get_slice_helper(JNIEnv* env, jobjectArray ranges, jsize index, - std::unique_ptr& slice, + ROCKSDB_NAMESPACE::OptSlice& opt_slice, std::vector>& ranges_to_free) { jobject jArray = env->GetObjectArrayElement(ranges, index); if (env->ExceptionCheck()) { @@ -3659,8 +3705,8 @@ bool get_slice_helper(JNIEnv* env, jobjectArray ranges, jsize index, return false; } env->DeleteLocalRef(jArray); - slice.reset(new ROCKSDB_NAMESPACE::Slice( - reinterpret_cast(ranges_to_free.back().get()), len_ba)); + opt_slice = ROCKSDB_NAMESPACE::Slice( + reinterpret_cast(ranges_to_free.back().get()), len_ba); return true; } /* @@ -3675,24 +3721,24 @@ void Java_org_rocksdb_RocksDB_deleteFilesInRanges(JNIEnv* env, jclass /*jdb*/, jboolean include_end) { jsize length = env->GetArrayLength(ranges); - std::vector rangesVector; - std::vector> slices; + std::vector rangesVector; + std::vector slices; std::vector> ranges_to_free; for (jsize i = 0; (i + 1) < length; i += 2) { - slices.push_back(std::unique_ptr()); + slices.emplace_back(); if (!get_slice_helper(env, ranges, i, slices.back(), ranges_to_free)) { // exception thrown return; } - slices.push_back(std::unique_ptr()); + slices.emplace_back(); if (!get_slice_helper(env, ranges, i + 1, slices.back(), ranges_to_free)) { // exception thrown return; } - rangesVector.push_back(ROCKSDB_NAMESPACE::RangePtr( - slices[slices.size() - 2].get(), slices[slices.size() - 1].get())); + rangesVector.push_back(ROCKSDB_NAMESPACE::RangeOpt( + slices[slices.size() - 2], slices[slices.size() - 1])); } auto* db = reinterpret_cast(jdb_handle); diff --git a/java/rocksjni/sst_file_readerjni.cc b/java/rocksjni/sst_file_readerjni.cc index 4af472ecfb1c..c0370b1d64d8 100644 --- a/java/rocksjni/sst_file_readerjni.cc +++ b/java/rocksjni/sst_file_readerjni.cc @@ -24,12 +24,11 @@ * Method: newSstFileReader * Signature: (J)J */ -jlong Java_org_rocksdb_SstFileReader_newSstFileReader(JNIEnv * /*env*/, +jlong Java_org_rocksdb_SstFileReader_newSstFileReader(JNIEnv* /*env*/, jclass /*jcls*/, jlong joptions) { - auto *options = - reinterpret_cast(joptions); - ROCKSDB_NAMESPACE::SstFileReader *sst_file_reader = + auto* options = reinterpret_cast(joptions); + ROCKSDB_NAMESPACE::SstFileReader* sst_file_reader = new ROCKSDB_NAMESPACE::SstFileReader(*options); return GET_CPLUSPLUS_POINTER(sst_file_reader); } @@ -39,15 +38,15 @@ jlong Java_org_rocksdb_SstFileReader_newSstFileReader(JNIEnv * /*env*/, * Method: open * Signature: (JLjava/lang/String;)V */ -void Java_org_rocksdb_SstFileReader_open(JNIEnv *env, jclass /*jcls*/, +void Java_org_rocksdb_SstFileReader_open(JNIEnv* env, jclass /*jcls*/, jlong jhandle, jstring jfile_path) { - const char *file_path = env->GetStringUTFChars(jfile_path, nullptr); + const char* file_path = env->GetStringUTFChars(jfile_path, nullptr); if (file_path == nullptr) { // exception thrown: OutOfMemoryError return; } ROCKSDB_NAMESPACE::Status s = - reinterpret_cast(jhandle)->Open( + reinterpret_cast(jhandle)->Open( file_path); env->ReleaseStringUTFChars(jfile_path, file_path); @@ -61,13 +60,13 @@ void Java_org_rocksdb_SstFileReader_open(JNIEnv *env, jclass /*jcls*/, * Method: newIterator * Signature: (JJ)J */ -jlong Java_org_rocksdb_SstFileReader_newIterator(JNIEnv * /*env*/, +jlong Java_org_rocksdb_SstFileReader_newIterator(JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle, jlong jread_options_handle) { - auto *sst_file_reader = - reinterpret_cast(jhandle); - auto *read_options = - reinterpret_cast(jread_options_handle); + auto* sst_file_reader = + reinterpret_cast(jhandle); + auto* read_options = + reinterpret_cast(jread_options_handle); return GET_CPLUSPLUS_POINTER(sst_file_reader->NewIterator(*read_options)); } @@ -76,10 +75,10 @@ jlong Java_org_rocksdb_SstFileReader_newIterator(JNIEnv * /*env*/, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_SstFileReader_disposeInternalJni(JNIEnv * /*env*/, +void Java_org_rocksdb_SstFileReader_disposeInternalJni(JNIEnv* /*env*/, jclass /*jcls*/, jlong jhandle) { - delete reinterpret_cast(jhandle); + delete reinterpret_cast(jhandle); } /* @@ -87,10 +86,10 @@ void Java_org_rocksdb_SstFileReader_disposeInternalJni(JNIEnv * /*env*/, * Method: verifyChecksum * Signature: (J)V */ -void Java_org_rocksdb_SstFileReader_verifyChecksum(JNIEnv *env, jclass /*jcls*/, +void Java_org_rocksdb_SstFileReader_verifyChecksum(JNIEnv* env, jclass /*jcls*/, jlong jhandle) { - auto *sst_file_reader = - reinterpret_cast(jhandle); + auto* sst_file_reader = + reinterpret_cast(jhandle); auto s = sst_file_reader->VerifyChecksum(); if (!s.ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); @@ -102,11 +101,11 @@ void Java_org_rocksdb_SstFileReader_verifyChecksum(JNIEnv *env, jclass /*jcls*/, * Method: getTableProperties * Signature: (J)J */ -jobject Java_org_rocksdb_SstFileReader_getTableProperties(JNIEnv *env, +jobject Java_org_rocksdb_SstFileReader_getTableProperties(JNIEnv* env, jclass /*jcls*/, jlong jhandle) { - auto *sst_file_reader = - reinterpret_cast(jhandle); + auto* sst_file_reader = + reinterpret_cast(jhandle); std::shared_ptr tp = sst_file_reader->GetTableProperties(); jobject jtable_properties = diff --git a/java/rocksjni/sst_file_writerjni.cc b/java/rocksjni/sst_file_writerjni.cc index 481adbc85640..fbe888ab01b3 100644 --- a/java/rocksjni/sst_file_writerjni.cc +++ b/java/rocksjni/sst_file_writerjni.cc @@ -25,27 +25,26 @@ * Signature: (JJJB)J */ jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJJB( - JNIEnv * /*env*/, jclass /*jcls*/, jlong jenvoptions, jlong joptions, + JNIEnv* /*env*/, jclass /*jcls*/, jlong jenvoptions, jlong joptions, jlong jcomparator_handle, jbyte jcomparator_type) { - ROCKSDB_NAMESPACE::Comparator *comparator = nullptr; + ROCKSDB_NAMESPACE::Comparator* comparator = nullptr; switch (jcomparator_type) { // JAVA_COMPARATOR case 0x0: - comparator = reinterpret_cast( + comparator = reinterpret_cast( jcomparator_handle); break; // JAVA_NATIVE_COMPARATOR_WRAPPER case 0x1: comparator = - reinterpret_cast(jcomparator_handle); + reinterpret_cast(jcomparator_handle); break; } - auto *env_options = - reinterpret_cast(jenvoptions); - auto *options = - reinterpret_cast(joptions); - ROCKSDB_NAMESPACE::SstFileWriter *sst_file_writer = + auto* env_options = + reinterpret_cast(jenvoptions); + auto* options = reinterpret_cast(joptions); + ROCKSDB_NAMESPACE::SstFileWriter* sst_file_writer = new ROCKSDB_NAMESPACE::SstFileWriter(*env_options, *options, comparator); return GET_CPLUSPLUS_POINTER(sst_file_writer); } @@ -55,15 +54,14 @@ jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJJB( * Method: newSstFileWriter * Signature: (JJ)J */ -jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJ(JNIEnv * /*env*/, +jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJ(JNIEnv* /*env*/, jclass /*jcls*/, jlong jenvoptions, jlong joptions) { - auto *env_options = - reinterpret_cast(jenvoptions); - auto *options = - reinterpret_cast(joptions); - ROCKSDB_NAMESPACE::SstFileWriter *sst_file_writer = + auto* env_options = + reinterpret_cast(jenvoptions); + auto* options = reinterpret_cast(joptions); + ROCKSDB_NAMESPACE::SstFileWriter* sst_file_writer = new ROCKSDB_NAMESPACE::SstFileWriter(*env_options, *options); return GET_CPLUSPLUS_POINTER(sst_file_writer); } @@ -73,15 +71,15 @@ jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJ(JNIEnv * /*env*/, * Method: open * Signature: (JLjava/lang/String;)V */ -void Java_org_rocksdb_SstFileWriter_open(JNIEnv *env, jclass /*jcls*/, +void Java_org_rocksdb_SstFileWriter_open(JNIEnv* env, jclass /*jcls*/, jlong jhandle, jstring jfile_path) { - const char *file_path = env->GetStringUTFChars(jfile_path, nullptr); + const char* file_path = env->GetStringUTFChars(jfile_path, nullptr); if (file_path == nullptr) { // exception thrown: OutOfMemoryError return; } ROCKSDB_NAMESPACE::Status s = - reinterpret_cast(jhandle)->Open( + reinterpret_cast(jhandle)->Open( file_path); env->ReleaseStringUTFChars(jfile_path, file_path); @@ -95,14 +93,14 @@ void Java_org_rocksdb_SstFileWriter_open(JNIEnv *env, jclass /*jcls*/, * Method: put * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv *env, jclass /*jcls*/, +void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv* env, jclass /*jcls*/, jlong jhandle, jlong jkey_handle, jlong jvalue_handle) { - auto *key_slice = reinterpret_cast(jkey_handle); - auto *value_slice = - reinterpret_cast(jvalue_handle); + auto* key_slice = reinterpret_cast(jkey_handle); + auto* value_slice = + reinterpret_cast(jvalue_handle); ROCKSDB_NAMESPACE::Status s = - reinterpret_cast(jhandle)->Put( + reinterpret_cast(jhandle)->Put( *key_slice, *value_slice); if (!s.ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); @@ -114,28 +112,28 @@ void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv *env, jclass /*jcls*/, * Method: put * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv *env, jclass /*jcls*/, +void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv* env, jclass /*jcls*/, jlong jhandle, jbyteArray jkey, jbyteArray jval) { - jbyte *key = env->GetByteArrayElements(jkey, nullptr); + jbyte* key = env->GetByteArrayElements(jkey, nullptr); if (key == nullptr) { // exception thrown: OutOfMemoryError return; } - ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), + ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), env->GetArrayLength(jkey)); - jbyte *value = env->GetByteArrayElements(jval, nullptr); + jbyte* value = env->GetByteArrayElements(jval, nullptr); if (value == nullptr) { // exception thrown: OutOfMemoryError env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); return; } - ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast(value), + ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast(value), env->GetArrayLength(jval)); ROCKSDB_NAMESPACE::Status s = - reinterpret_cast(jhandle)->Put( + reinterpret_cast(jhandle)->Put( key_slice, value_slice); env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); @@ -151,15 +149,15 @@ void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv *env, jclass /*jcls*/, * Method: putDirect * Signature: (JLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;II)V */ -void Java_org_rocksdb_SstFileWriter_putDirect(JNIEnv *env, jclass /*jcls*/, +void Java_org_rocksdb_SstFileWriter_putDirect(JNIEnv* env, jclass /*jcls*/, jlong jdb_handle, jobject jkey, jint jkey_off, jint jkey_len, jobject jval, jint jval_off, jint jval_len) { - auto *writer = - reinterpret_cast(jdb_handle); - auto put = [&env, &writer](ROCKSDB_NAMESPACE::Slice &key, - ROCKSDB_NAMESPACE::Slice &value) { + auto* writer = + reinterpret_cast(jdb_handle); + auto put = [&env, &writer](ROCKSDB_NAMESPACE::Slice& key, + ROCKSDB_NAMESPACE::Slice& value) { ROCKSDB_NAMESPACE::Status s = writer->Put(key, value); if (s.ok()) { return; @@ -175,10 +173,10 @@ void Java_org_rocksdb_SstFileWriter_putDirect(JNIEnv *env, jclass /*jcls*/, * Method: fileSize * Signature: (J)J */ -jlong Java_org_rocksdb_SstFileWriter_fileSize(JNIEnv * /*env*/, jclass /*jcls*/, +jlong Java_org_rocksdb_SstFileWriter_fileSize(JNIEnv* /*env*/, jclass /*jcls*/, jlong jdb_handle) { - auto *writer = - reinterpret_cast(jdb_handle); + auto* writer = + reinterpret_cast(jdb_handle); return static_cast(writer->FileSize()); } @@ -187,14 +185,14 @@ jlong Java_org_rocksdb_SstFileWriter_fileSize(JNIEnv * /*env*/, jclass /*jcls*/, * Method: merge * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv *env, jclass /*jcls*/, +void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv* env, jclass /*jcls*/, jlong jhandle, jlong jkey_handle, jlong jvalue_handle) { - auto *key_slice = reinterpret_cast(jkey_handle); - auto *value_slice = - reinterpret_cast(jvalue_handle); + auto* key_slice = reinterpret_cast(jkey_handle); + auto* value_slice = + reinterpret_cast(jvalue_handle); ROCKSDB_NAMESPACE::Status s = - reinterpret_cast(jhandle)->Merge( + reinterpret_cast(jhandle)->Merge( *key_slice, *value_slice); if (!s.ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); @@ -206,29 +204,29 @@ void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv *env, jclass /*jcls*/, * Method: merge * Signature: (J[B[B)V */ -void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv *env, jclass /*jcls*/, +void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv* env, jclass /*jcls*/, jlong jhandle, jbyteArray jkey, jbyteArray jval) { - jbyte *key = env->GetByteArrayElements(jkey, nullptr); + jbyte* key = env->GetByteArrayElements(jkey, nullptr); if (key == nullptr) { // exception thrown: OutOfMemoryError return; } - ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), + ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), env->GetArrayLength(jkey)); - jbyte *value = env->GetByteArrayElements(jval, nullptr); + jbyte* value = env->GetByteArrayElements(jval, nullptr); if (value == nullptr) { // exception thrown: OutOfMemoryError env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); return; } - ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast(value), + ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast(value), env->GetArrayLength(jval)); ROCKSDB_NAMESPACE::Status s = - reinterpret_cast(jhandle)->Merge( + reinterpret_cast(jhandle)->Merge( key_slice, value_slice); env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); @@ -244,19 +242,19 @@ void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv *env, jclass /*jcls*/, * Method: delete * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv *env, jclass /*jcls*/, +void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv* env, jclass /*jcls*/, jlong jhandle, jbyteArray jkey) { - jbyte *key = env->GetByteArrayElements(jkey, nullptr); + jbyte* key = env->GetByteArrayElements(jkey, nullptr); if (key == nullptr) { // exception thrown: OutOfMemoryError return; } - ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), + ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), env->GetArrayLength(jkey)); ROCKSDB_NAMESPACE::Status s = - reinterpret_cast(jhandle)->Delete( + reinterpret_cast(jhandle)->Delete( key_slice); env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); @@ -271,12 +269,12 @@ void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv *env, jclass /*jcls*/, * Method: delete * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv *env, jclass /*jcls*/, +void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv* env, jclass /*jcls*/, jlong jhandle, jlong jkey_handle) { - auto *key_slice = reinterpret_cast(jkey_handle); + auto* key_slice = reinterpret_cast(jkey_handle); ROCKSDB_NAMESPACE::Status s = - reinterpret_cast(jhandle)->Delete( + reinterpret_cast(jhandle)->Delete( *key_slice); if (!s.ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); @@ -288,10 +286,10 @@ void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv *env, jclass /*jcls*/, * Method: finish * Signature: (J)V */ -void Java_org_rocksdb_SstFileWriter_finish(JNIEnv *env, jclass /*jcls*/, +void Java_org_rocksdb_SstFileWriter_finish(JNIEnv* env, jclass /*jcls*/, jlong jhandle) { ROCKSDB_NAMESPACE::Status s = - reinterpret_cast(jhandle)->Finish(); + reinterpret_cast(jhandle)->Finish(); if (!s.ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); } @@ -302,8 +300,8 @@ void Java_org_rocksdb_SstFileWriter_finish(JNIEnv *env, jclass /*jcls*/, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_SstFileWriter_disposeInternalJni(JNIEnv * /*env*/, +void Java_org_rocksdb_SstFileWriter_disposeInternalJni(JNIEnv* /*env*/, jclass /*jobj*/, jlong jhandle) { - delete reinterpret_cast(jhandle); + delete reinterpret_cast(jhandle); } diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index eb5de1695e6c..064a5b1a7fac 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -23,7 +23,7 @@ * Signature: (IIDIIBZZ)J */ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( - JNIEnv * /*env*/, jclass /*jcls*/, jint jkey_size, jint jbloom_bits_per_key, + JNIEnv* /*env*/, jclass /*jcls*/, jint jkey_size, jint jbloom_bits_per_key, jdouble jhash_table_ratio, jint jindex_sparseness, jint jhuge_page_tlb_size, jbyte jencoding_type, jboolean jfull_scan_mode, jboolean jstore_index_in_file) { @@ -45,10 +45,10 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( /* * Class: org_rocksdb_BlockBasedTableConfig * Method: newTableFactoryHandle - * Signature: (ZZZZBBDBZJJJJIIIJZZZJZZIIZZBJIJI)J + * Signature: (ZZZZBBDBZJJJIIIJZZZJZZIIZZJJBBJI)J */ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( - JNIEnv *, jclass, jboolean jcache_index_and_filter_blocks, + JNIEnv*, jclass, jboolean jcache_index_and_filter_blocks, jboolean jcache_index_and_filter_blocks_with_high_priority, jboolean jpin_l0_filter_and_index_blocks_in_cache, jboolean jpin_top_level_index_and_filter, jbyte jindex_type_value, @@ -63,7 +63,9 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( jboolean jwhole_key_filtering, jboolean jverify_compression, jint jread_amp_bytes_per_bit, jint jformat_version, jboolean jenable_index_compression, jboolean jblock_align, - jbyte jindex_shortening, jlong jblock_cache_size, + jlong jsuper_block_alignment_size, + jlong jsuper_block_alignment_space_overhead_ratio, jbyte jindex_shortening, + jbyte jindex_search_type, jlong jblock_cache_size, jint jblock_cache_num_shard_bits) { ROCKSDB_NAMESPACE::BlockBasedTableOptions options; options.cache_index_and_filter_blocks = @@ -88,8 +90,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.block_cache = nullptr; } else { if (jblock_cache_handle > 0) { - std::shared_ptr *pCache = - reinterpret_cast *>( + std::shared_ptr* pCache = + reinterpret_cast*>( jblock_cache_handle); options.block_cache = *pCache; } else if (jblock_cache_size >= 0) { @@ -107,8 +109,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( } } if (jpersistent_cache_handle > 0) { - std::shared_ptr *pCache = - reinterpret_cast *>( + std::shared_ptr* pCache = + reinterpret_cast*>( jpersistent_cache_handle); options.persistent_cache = *pCache; } @@ -123,8 +125,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( static_cast(joptimize_filters_for_memory); options.use_delta_encoding = static_cast(juse_delta_encoding); if (jfilter_policy_handle > 0) { - std::shared_ptr *pFilterPolicy = - reinterpret_cast *>( + std::shared_ptr* pFilterPolicy = + reinterpret_cast*>( jfilter_policy_handle); options.filter_policy = *pFilterPolicy; } @@ -136,9 +138,16 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.enable_index_compression = static_cast(jenable_index_compression); options.block_align = static_cast(jblock_align); + options.super_block_alignment_size = + static_cast(jsuper_block_alignment_size); + options.super_block_alignment_space_overhead_ratio = + static_cast(jsuper_block_alignment_space_overhead_ratio); options.index_shortening = ROCKSDB_NAMESPACE::IndexShorteningModeJni::toCppIndexShorteningMode( jindex_shortening); + options.index_block_search_type = + ROCKSDB_NAMESPACE::IndexSearchTypeJni::toCppIndexSearchType( + jindex_search_type); return GET_CPLUSPLUS_POINTER( ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(options)); diff --git a/java/rocksjni/table_properties_collector_factory.cc b/java/rocksjni/table_properties_collector_factory.cc index 60e1df6e8b13..365a50d7eb5a 100644 --- a/java/rocksjni/table_properties_collector_factory.cc +++ b/java/rocksjni/table_properties_collector_factory.cc @@ -17,9 +17,9 @@ * Signature: (JJD)J */ jlong Java_org_rocksdb_TablePropertiesCollectorFactory_newCompactOnDeletionCollectorFactory( - JNIEnv *, jclass, jlong sliding_window_size, jlong deletion_trigger, + JNIEnv*, jclass, jlong sliding_window_size, jlong deletion_trigger, jdouble deletion_ratio) { - auto *wrapper = new TablePropertiesCollectorFactoriesJniWrapper(); + auto* wrapper = new TablePropertiesCollectorFactoriesJniWrapper(); wrapper->table_properties_collector_factories = ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory( sliding_window_size, deletion_trigger, deletion_ratio); @@ -32,8 +32,8 @@ jlong Java_org_rocksdb_TablePropertiesCollectorFactory_newCompactOnDeletionColle * Signature: (J)J */ void Java_org_rocksdb_TablePropertiesCollectorFactory_deleteCompactOnDeletionCollectorFactory( - JNIEnv *, jclass, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto instance = - reinterpret_cast(jhandle); + reinterpret_cast(jhandle); delete instance; } diff --git a/java/rocksjni/testable_event_listener.cc b/java/rocksjni/testable_event_listener.cc index 483ade160561..febf8cbd1bb7 100644 --- a/java/rocksjni/testable_event_listener.cc +++ b/java/rocksjni/testable_event_listener.cc @@ -78,9 +78,9 @@ static TableProperties newTablePropertiesForTest() { * Signature: (J)V */ void Java_org_rocksdb_test_TestableEventListener_invokeAllCallbacks( - JNIEnv *, jclass, jlong jhandle) { - const auto &el = - *reinterpret_cast *>( + JNIEnv*, jclass, jlong jhandle) { + const auto& el = + *reinterpret_cast*>( jhandle); TableProperties table_properties = newTablePropertiesForTest(); @@ -127,7 +127,7 @@ void Java_org_rocksdb_test_TestableEventListener_invokeAllCallbacks( compaction_job_info.output_file_infos = {}; compaction_job_info.table_properties = { {"tableProperties", std::shared_ptr( - &table_properties, [](TableProperties *) {})}}; + &table_properties, [](TableProperties*) {})}}; compaction_job_info.compaction_reason = CompactionReason::kFlush; compaction_job_info.compression = CompressionType::kSnappyCompression; diff --git a/java/rocksjni/transaction.cc b/java/rocksjni/transaction.cc index e211ebe5d6dd..f457ef331c54 100644 --- a/java/rocksjni/transaction.cc +++ b/java/rocksjni/transaction.cc @@ -341,6 +341,36 @@ jobjectArray Java_org_rocksdb_Transaction_multiGet__JJ_3_3B( statuses); } +/* + * Class: org_rocksdb_Transaction + * Method: multiGet + * Signature: (JJJ[[B)[[B + */ +jobjectArray Java_org_rocksdb_Transaction_multiGet__JJJ_3_3B( + JNIEnv* env, jclass /*jobj*/, jlong jhandle, jlong jread_options_handle, + jlong jcf_handle, jobjectArray jkeys) { + ROCKSDB_NAMESPACE::MultiGetJNIKeys keys; + if (!keys.fromByteArrays(env, jkeys)) { + return nullptr; + } + + auto* txn = reinterpret_cast(jhandle); + auto* cf_handle = + reinterpret_cast(jcf_handle); + + size_t num_keys = keys.size(); + std::vector values(num_keys); + std::vector statuses(num_keys); + + txn->MultiGet( + *reinterpret_cast(jread_options_handle), + cf_handle, num_keys, keys.slices().data(), values.data(), statuses.data(), + /*sorted_input=*/false); + + return ROCKSDB_NAMESPACE::MultiGetJNIValues::byteArrays< + ROCKSDB_NAMESPACE::PinnableSlice>(env, values, statuses); +} + /* * Class: org_rocksdb_Transaction * Method: getForUpdate diff --git a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java index d1d1123dded4..867f5ca959bd 100644 --- a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java @@ -44,53 +44,6 @@ T setMinWriteBufferNumberToMerge( */ int minWriteBufferNumberToMerge(); - /** - * The total maximum number of write buffers to maintain in memory including - * copies of buffers that have already been flushed. Unlike - * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()}, - * this parameter does not affect flushing. - * This controls the minimum amount of write history that will be available - * in memory for conflict checking when Transactions are used. - *

- * When using an OptimisticTransactionDB: - * If this value is too low, some transactions may fail at commit time due - * to not being able to determine whether there were any write conflicts. - *

- * When using a TransactionDB: - * If Transaction::SetSnapshot is used, TransactionDB will read either - * in-memory write buffers or SST files to do write-conflict checking. - * Increasing this value can reduce the number of reads to SST files - * done for conflict detection. - *

- * Setting this value to 0 will cause write buffers to be freed immediately - * after they are flushed. - * If this value is set to -1, - * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()} - * will be used. - *

- * Default: - * If using a TransactionDB/OptimisticTransactionDB, the default value will - * be set to the value of - * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()} - * if it is not explicitly set by the user. Otherwise, the default is 0. - * - * @param maxWriteBufferNumberToMaintain The maximum number of write - * buffers to maintain - * - * @return the reference to the current options. - */ - T setMaxWriteBufferNumberToMaintain( - int maxWriteBufferNumberToMaintain); - - /** - * The total maximum number of write buffers to maintain in memory including - * copies of buffers that have already been flushed. - * - * @return maxWriteBufferNumberToMaintain The maximum number of write buffers - * to maintain - */ - int maxWriteBufferNumberToMaintain(); - /** * Allows thread-safe inplace updates. * If inplace_callback function is not set, diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java index c8159db2ddca..555f54f3b748 100644 --- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -37,10 +37,13 @@ public BlockBasedTableConfig() { wholeKeyFiltering = true; verifyCompression = false; readAmpBytesPerBit = 0; - formatVersion = 6; + formatVersion = 7; enableIndexCompression = true; blockAlign = false; + superBlockAlignmentSize = 0; + superBlockAlignmentSpaceOverheadRatio = 128; indexShortening = IndexShorteningMode.kShortenSeparators; + indexSearchType = IndexSearchType.kBinary; // NOTE: ONLY used if blockCache == null blockCacheSize = 8 * 1024 * 1024; @@ -60,9 +63,10 @@ private BlockBasedTableConfig(final boolean cacheIndexAndFilterBlocks, final boolean partitionFilters, final boolean optimizeFiltersForMemory, final boolean useDeltaEncoding, final boolean wholeKeyFiltering, final boolean verifyCompression, final int readAmpBytesPerBit, final int formatVersion, - final boolean enableIndexCompression, final boolean blockAlign, final byte indexShortening, - final byte filterPolicyType, final long filterPolicyHandle, - final double filterPolicyConfigValue) { + final boolean enableIndexCompression, final boolean blockAlign, + final long superBlockAlignmentSize, final long superBlockAlignmentSpaceOverheadRatio, + final byte indexShortening, final byte indexSearchType, final byte filterPolicyType, + final long filterPolicyHandle, final double filterPolicyConfigValue) { this.cacheIndexAndFilterBlocks = cacheIndexAndFilterBlocks; this.cacheIndexAndFilterBlocksWithHighPriority = cacheIndexAndFilterBlocksWithHighPriority; this.pinL0FilterAndIndexBlocksInCache = pinL0FilterAndIndexBlocksInCache; @@ -86,7 +90,10 @@ private BlockBasedTableConfig(final boolean cacheIndexAndFilterBlocks, this.formatVersion = formatVersion; this.enableIndexCompression = enableIndexCompression; this.blockAlign = blockAlign; + this.superBlockAlignmentSize = superBlockAlignmentSize; + this.superBlockAlignmentSpaceOverheadRatio = superBlockAlignmentSpaceOverheadRatio; this.indexShortening = IndexShorteningMode.values()[indexShortening]; + this.indexSearchType = IndexSearchType.values()[indexSearchType]; try (Filter filterPolicy = FilterPolicyType.values()[filterPolicyType].createFilter( filterPolicyHandle, filterPolicyConfigValue)) { if (filterPolicy != null) { @@ -799,6 +806,50 @@ public BlockBasedTableConfig setBlockAlign(final boolean blockAlign) { return this; } + /** + * Get the super block alignment size. + * + * @return the super block alignment size. + */ + public long superBlockAlignmentSize() { + return superBlockAlignmentSize; + } + + /** + * Set the super block alignment size. + * When set to 0, super block alignment is disabled. + * + * @param superBlockAlignmentSize the super block alignment size. + * + * @return the reference to the current option. + */ + public BlockBasedTableConfig setSuperBlockAlignmentSize(final long superBlockAlignmentSize) { + this.superBlockAlignmentSize = superBlockAlignmentSize; + return this; + } + + /** + * Get the space overhead ratio of super block alignment. + * + * @return space overhead ratio of super block alignment. + */ + public long superBlockAlignmentSpaceOverheadRatio() { + return superBlockAlignmentSpaceOverheadRatio; + } + + /** + * Set the space overhead ratio of super block alignment. + * + * @param superBlockAlignmentSpaceOverheadRatio the space overhead ratio of super block alignment. + * + * @return the reference to the current option. + */ + public BlockBasedTableConfig setSuperBlockAlignmentSpaceOverheadRatio( + final long superBlockAlignmentSpaceOverheadRatio) { + this.superBlockAlignmentSpaceOverheadRatio = superBlockAlignmentSpaceOverheadRatio; + return this; + } + /** * Get the index shortening mode. * @@ -822,6 +873,26 @@ public BlockBasedTableConfig setIndexShortening(final IndexShorteningMode indexS return this; } + /** + * Get the index search type. + * + * @return the currently set index search type + */ + public IndexSearchType indexSearchType() { + return indexSearchType; + } + + /** + * Sets the index search type to used with this table. + * + * @param indexSearchType {@link org.rocksdb.IndexSearchType} value + * @return the reference to the current option. + */ + public BlockBasedTableConfig setIndexSearchType(final IndexSearchType indexSearchType) { + this.indexSearchType = indexSearchType; + return this; + } + /** * Get the size of the cache in bytes that will be used by RocksDB. * @@ -946,7 +1017,8 @@ public BlockBasedTableConfig setHashIndexAllowCollision( indexBlockRestartInterval, metadataBlockSize, partitionFilters, optimizeFiltersForMemory, useDeltaEncoding, filterPolicyHandle, wholeKeyFiltering, verifyCompression, readAmpBytesPerBit, formatVersion, enableIndexCompression, blockAlign, - indexShortening.getValue(), blockCacheSize, blockCacheNumShardBits); + superBlockAlignmentSize, superBlockAlignmentSpaceOverheadRatio, indexShortening.getValue(), + indexSearchType.getValue(), blockCacheSize, blockCacheNumShardBits); } private static native long newTableFactoryHandle(final boolean cacheIndexAndFilterBlocks, @@ -961,7 +1033,9 @@ private static native long newTableFactoryHandle(final boolean cacheIndexAndFilt final boolean useDeltaEncoding, final long filterPolicyHandle, final boolean wholeKeyFiltering, final boolean verifyCompression, final int readAmpBytesPerBit, final int formatVersion, final boolean enableIndexCompression, - final boolean blockAlign, final byte indexShortening, + final boolean blockAlign, final long superBlockAlignmentSize, + final long superBlockAlignmentSpaceOverheadRatio, final byte indexShortening, + final byte indexSearchType, @Deprecated final long blockCacheSize, @Deprecated final int blockCacheNumShardBits); @@ -992,7 +1066,10 @@ private static native long newTableFactoryHandle(final boolean cacheIndexAndFilt private int formatVersion; private boolean enableIndexCompression; private boolean blockAlign; + private long superBlockAlignmentSize; + private long superBlockAlignmentSpaceOverheadRatio; private IndexShorteningMode indexShortening; + private IndexSearchType indexSearchType; // NOTE: ONLY used if blockCache == null @Deprecated private long blockCacheSize; diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java index 3af4d2a8ed6f..d25f8c73bc7b 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java @@ -835,19 +835,6 @@ public boolean paranoidFileChecks() { return paranoidFileChecks(nativeHandle_); } - @Override - public ColumnFamilyOptions setMaxWriteBufferNumberToMaintain( - final int maxWriteBufferNumberToMaintain) { - setMaxWriteBufferNumberToMaintain( - nativeHandle_, maxWriteBufferNumberToMaintain); - return this; - } - - @Override - public int maxWriteBufferNumberToMaintain() { - return maxWriteBufferNumberToMaintain(nativeHandle_); - } - @Override public ColumnFamilyOptions setCompactionPriority( final CompactionPriority compactionPriority) { @@ -1467,9 +1454,6 @@ private static native void setMaxBytesForLevelMultiplierAdditional( private static native int[] maxBytesForLevelMultiplierAdditional(long handle); private static native void setParanoidFileChecks(long handle, boolean paranoidFileChecks); private static native boolean paranoidFileChecks(long handle); - private static native void setMaxWriteBufferNumberToMaintain( - final long handle, final int maxWriteBufferNumberToMaintain); - private static native int maxWriteBufferNumberToMaintain(final long handle); private static native void setCompactionPriority( final long handle, final byte compactionPriority); private static native byte compactionPriority(final long handle); diff --git a/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java b/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java index 24ebe0da2ff1..3d94e7eb0215 100644 --- a/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java +++ b/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java @@ -75,6 +75,51 @@ public boolean allowCompaction() { return allowCompaction(nativeHandle_); } + /** + * Combined SST + blob file size limit for FIFO compaction trimming. + * When non-zero, FIFO uses total_sst + total_blob for size-based dropping. + * When zero (default), uses max_table_files_size (SST-only). + * + * @param maxDataFilesSize the combined size limit in bytes + * + * @return the reference to the current options. + */ + public CompactionOptionsFIFO setMaxDataFilesSize(final long maxDataFilesSize) { + setMaxDataFilesSize(nativeHandle_, maxDataFilesSize); + return this; + } + + /** + * Get the combined SST + blob file size limit. + * + * @return max data files size in bytes, 0 means disabled + */ + public long maxDataFilesSize() { + return maxDataFilesSize(nativeHandle_); + } + + /** + * Enable capacity-derived intra-L0 compaction using the observed key/value + * size ratio. Requires maxDataFilesSize > 0. + * + * @param useKvRatioCompaction true to enable + * + * @return the reference to the current options. + */ + public CompactionOptionsFIFO setUseKvRatioCompaction(final boolean useKvRatioCompaction) { + setUseKvRatioCompaction(nativeHandle_, useKvRatioCompaction); + return this; + } + + /** + * Check if capacity-derived intra-L0 compaction is enabled. + * + * @return true if enabled + */ + public boolean useKvRatioCompaction() { + return useKvRatioCompaction(nativeHandle_); + } + private static native long newCompactionOptionsFIFO(); @Override protected final void disposeInternal(final long handle) { @@ -86,4 +131,9 @@ protected final void disposeInternal(final long handle) { private static native long maxTableFilesSize(final long handle); private static native void setAllowCompaction(final long handle, final boolean allowCompaction); private static native boolean allowCompaction(final long handle); + private static native void setMaxDataFilesSize(final long handle, final long maxDataFilesSize); + private static native long maxDataFilesSize(final long handle); + private static native void setUseKvRatioCompaction( + final long handle, final boolean useKvRatioCompaction); + private static native boolean useKvRatioCompaction(final long handle); } diff --git a/java/src/main/java/org/rocksdb/DBOptions.java b/java/src/main/java/org/rocksdb/DBOptions.java index 0221a63fba07..12f5d4913c2f 100644 --- a/java/src/main/java/org/rocksdb/DBOptions.java +++ b/java/src/main/java/org/rocksdb/DBOptions.java @@ -962,19 +962,6 @@ public boolean skipStatsUpdateOnDbOpen() { return skipStatsUpdateOnDbOpen(nativeHandle_); } - @Override - public DBOptions setSkipCheckingSstFileSizesOnDbOpen( - final boolean skipCheckingSstFileSizesOnDbOpen) { - setSkipCheckingSstFileSizesOnDbOpen(nativeHandle_, skipCheckingSstFileSizesOnDbOpen); - return this; - } - - @Override - public boolean skipCheckingSstFileSizesOnDbOpen() { - assert (isOwningHandle()); - return skipCheckingSstFileSizesOnDbOpen(nativeHandle_); - } - @Override public DBOptions setWalRecoveryMode(final WALRecoveryMode walRecoveryMode) { assert(isOwningHandle()); @@ -1389,9 +1376,6 @@ private static native void setWriteThreadSlowYieldUsec( private static native void setSkipStatsUpdateOnDbOpen( final long handle, final boolean skipStatsUpdateOnDbOpen); private static native boolean skipStatsUpdateOnDbOpen(final long handle); - private static native void setSkipCheckingSstFileSizesOnDbOpen( - final long handle, final boolean skipChecking); - private static native boolean skipCheckingSstFileSizesOnDbOpen(final long handle); private static native void setWalRecoveryMode(final long handle, final byte walRecoveryMode); private static native byte walRecoveryMode(final long handle); private static native void setAllow2pc(final long handle, final boolean allow2pc); diff --git a/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java index bc9d9acbd65e..f40fc1a25cfe 100644 --- a/java/src/main/java/org/rocksdb/DBOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/DBOptionsInterface.java @@ -1214,36 +1214,6 @@ T setEnableWriteThreadAdaptiveYield( */ boolean skipStatsUpdateOnDbOpen(); - /** - * If true, then {@link RocksDB#open(String)} will not fetch and check sizes of all sst files. - * This may significantly speed up startup if there are many sst files, - * especially when using non-default Env with expensive GetFileSize(). - * We'll still check that all required sst files exist. - * If {@code paranoid_checks} is false, this option is ignored, and sst files are - * not checked at all. - * - * Default: false - * - * @param skipCheckingSstFileSizesOnDbOpen if true, then SST file sizes will not be checked - * when calling {@link RocksDB#open(String)}. - * @return the reference to the current options. - */ - T setSkipCheckingSstFileSizesOnDbOpen(final boolean skipCheckingSstFileSizesOnDbOpen); - - /** - * If true, then {@link RocksDB#open(String)} will not fetch and check sizes of all sst files. - * This may significantly speed up startup if there are many sst files, - * especially when using non-default Env with expensive GetFileSize(). - * We'll still check that all required sst files exist. - * If {@code paranoid_checks} is false, this option is ignored, and sst files are - * not checked at all. - * - * Default: false - * - * @return true, if file sizes will not be checked when calling {@link RocksDB#open(String)}. - */ - boolean skipCheckingSstFileSizesOnDbOpen(); - /** * Recovery mode to control the consistency while replaying WAL * diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java index 10d382e7b912..b4a56cc07e0d 100644 --- a/java/src/main/java/org/rocksdb/HistogramType.java +++ b/java/src/main/java/org/rocksdb/HistogramType.java @@ -210,7 +210,23 @@ public enum HistogramType { */ TABLE_OPEN_PREFETCH_TAIL_READ_BYTES((byte) 0x3D), - // 0x3E for backwards compatibility on current minor version. + COMPACTION_PREFETCH_BYTES((byte) 0x3F), + + /** + * MultiScan histogram statistics + */ + + /** + * Time spent in Iterator::Prepare() for multi-scan (microseconds) + */ + MULTISCAN_PREPARE_MICROS((byte) 0x40), + + /** + * Number of blocks per multi-scan Prepare() call + */ + MULTISCAN_BLOCKS_PER_PREPARE((byte) 0x41), + + // 0x3E is reserved for backwards compatibility on current minor version. HISTOGRAM_ENUM_MAX((byte) 0x3E); private final byte value; diff --git a/java/src/main/java/org/rocksdb/IndexSearchType.java b/java/src/main/java/org/rocksdb/IndexSearchType.java new file mode 100644 index 000000000000..55ec0eef3820 --- /dev/null +++ b/java/src/main/java/org/rocksdb/IndexSearchType.java @@ -0,0 +1,34 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * BlockSearchType used in conjunction with BlockBasedTable. + */ +public enum IndexSearchType { + /** + * Standard binary search + */ + kBinary((byte) 0x0), + + /** + * Interpolation search, which may be better suited for uniformly + * distributed keys. Only applicable if the comparator is the + * byte-wise comparator. + */ + kInterpolation((byte) 0x1); + + private final byte value; + + IndexSearchType(final byte value) { + this.value = value; + } + + byte getValue() { + return value; + } +} diff --git a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java index 6fe97994d201..aa841c6f3688 100644 --- a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java @@ -30,6 +30,14 @@ public class NativeLibraryLoader { private static final String tempFilePrefix = "librocksdbjni"; private static final String tempFileSuffix = Environment.getJniLibraryExtension(); + /** + * If you set the System Property ROCKS_JAVA_DEBUG_NLL can be to true + * messages about attempts to load the native library will be printed + * to std out. + */ + private static boolean DEBUG_LOADING = + "true".equals(System.getProperty("ROCKS_JAVA_DEBUG_NLL", "false")); + /** * Get a reference to the NativeLibraryLoader * @@ -55,7 +63,7 @@ public static NativeLibraryLoader getInstance() { * * @throws java.io.IOException if a filesystem operation fails. */ - @SuppressWarnings("PMD.EmptyCatchBlock") + @SuppressWarnings({"PMD.EmptyCatchBlock", "PMD.SystemPrintln"}) public synchronized void loadLibrary(final String tmpDir) throws IOException { try { // try dynamic library @@ -63,6 +71,9 @@ public synchronized void loadLibrary(final String tmpDir) throws IOException { return; } catch (final UnsatisfiedLinkError ule) { // ignore - try from static library + if (DEBUG_LOADING) { + System.out.println("Unable to load shared dynamic library: " + sharedLibraryName); + } } try { @@ -71,6 +82,9 @@ public synchronized void loadLibrary(final String tmpDir) throws IOException { return; } catch (final UnsatisfiedLinkError ule) { // ignore - then try static library fallback or from jar + if (DEBUG_LOADING) { + System.out.println("Unable to load shared static library: " + jniLibraryName); + } } if (fallbackJniLibraryName != null) { @@ -80,6 +94,10 @@ public synchronized void loadLibrary(final String tmpDir) throws IOException { return; } catch (final UnsatisfiedLinkError ule) { // ignore - then try from jar + if (DEBUG_LOADING) { + System.out.println( + "Unable to load shared static fallback library: " + fallbackJniLibraryName); + } } } @@ -137,18 +155,23 @@ private File createTemp(final String tmpDir, final String libraryFileName) throw } } - @SuppressWarnings({"PMD.UseProperClassLoader", "PMD.UseTryWithResources"}) + @SuppressWarnings({"PMD.UseProperClassLoader", "PMD.UseTryWithResources", "PMD.SystemPrintln"}) File loadLibraryFromJarToTemp(final String tmpDir) throws IOException { try (InputStream is = getClass().getClassLoader().getResourceAsStream(jniLibraryFileName)) { if (is != null) { final File temp = createTemp(tmpDir, jniLibraryFileName); Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); return temp; + } else { + if (DEBUG_LOADING) { + System.out.println("Unable to find: " + jniLibraryFileName + " on the classpath"); + } } } if (fallbackJniLibraryFileName == null) { - throw new RuntimeException(fallbackJniLibraryFileName + " was not found inside JAR."); + throw new RuntimeException( + jniLibraryFileName + " was not found inside JAR, and there is no fallback."); } try (InputStream is = @@ -157,10 +180,16 @@ File loadLibraryFromJarToTemp(final String tmpDir) throws IOException { final File temp = createTemp(tmpDir, fallbackJniLibraryFileName); Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); return temp; + } else { + if (DEBUG_LOADING) { + System.out.println( + "Unable to find fallback: " + fallbackJniLibraryFileName + " on the classpath"); + } } } - throw new RuntimeException(jniLibraryFileName + " was not found inside JAR."); + throw new RuntimeException("Neither " + jniLibraryFileName + " or " + fallbackJniLibraryFileName + + " were found inside the JAR, and there is no fallback."); } /** diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java index c184e140f602..3e7bf28405e8 100644 --- a/java/src/main/java/org/rocksdb/Options.java +++ b/java/src/main/java/org/rocksdb/Options.java @@ -1045,19 +1045,6 @@ public boolean skipStatsUpdateOnDbOpen() { return skipStatsUpdateOnDbOpen(nativeHandle_); } - @Override - public Options setSkipCheckingSstFileSizesOnDbOpen( - final boolean skipCheckingSstFileSizesOnDbOpen) { - setSkipCheckingSstFileSizesOnDbOpen(nativeHandle_, skipCheckingSstFileSizesOnDbOpen); - return this; - } - - @Override - public boolean skipCheckingSstFileSizesOnDbOpen() { - assert (isOwningHandle()); - return skipCheckingSstFileSizesOnDbOpen(nativeHandle_); - } - @Override public Options setWalRecoveryMode(final WALRecoveryMode walRecoveryMode) { assert(isOwningHandle()); @@ -1762,19 +1749,6 @@ public boolean paranoidFileChecks() { return paranoidFileChecks(nativeHandle_); } - @Override - public Options setMaxWriteBufferNumberToMaintain( - final int maxWriteBufferNumberToMaintain) { - setMaxWriteBufferNumberToMaintain( - nativeHandle_, maxWriteBufferNumberToMaintain); - return this; - } - - @Override - public int maxWriteBufferNumberToMaintain() { - return maxWriteBufferNumberToMaintain(nativeHandle_); - } - @Override public Options setCompactionPriority( final CompactionPriority compactionPriority) { @@ -2296,9 +2270,6 @@ private static native void setWriteThreadSlowYieldUsec( private static native void setSkipStatsUpdateOnDbOpen( final long handle, final boolean skipStatsUpdateOnDbOpen); private static native boolean skipStatsUpdateOnDbOpen(final long handle); - private static native void setSkipCheckingSstFileSizesOnDbOpen( - final long handle, final boolean skipChecking); - private static native boolean skipCheckingSstFileSizesOnDbOpen(final long handle); private static native void setWalRecoveryMode(final long handle, final byte walRecoveryMode); private static native byte walRecoveryMode(final long handle); private static native void setAllow2pc(final long handle, final boolean allow2pc); @@ -2443,9 +2414,6 @@ private static native void setMaxBytesForLevelMultiplierAdditional( private static native int[] maxBytesForLevelMultiplierAdditional(long handle); private static native void setParanoidFileChecks(long handle, boolean paranoidFileChecks); private static native boolean paranoidFileChecks(long handle); - private static native void setMaxWriteBufferNumberToMaintain( - final long handle, final int maxWriteBufferNumberToMaintain); - private static native int maxWriteBufferNumberToMaintain(final long handle); private static native void setCompactionPriority( final long handle, final byte compactionPriority); private static native byte compactionPriority(final long handle); diff --git a/java/src/main/java/org/rocksdb/ReadOptions.java b/java/src/main/java/org/rocksdb/ReadOptions.java index 5ce4a8656d3e..4be053376c61 100644 --- a/java/src/main/java/org/rocksdb/ReadOptions.java +++ b/java/src/main/java/org/rocksdb/ReadOptions.java @@ -186,37 +186,6 @@ public ReadOptions setTailing(final boolean tailing) { return this; } - /** - * Returns whether managed iterators will be used. - * - * @return the setting of whether managed iterators will be used, - * by default false - * - * @deprecated This options is not used anymore. - */ - @Deprecated - public boolean managed() { - assert(isOwningHandle()); - return managed(nativeHandle_); - } - - /** - * Specify to create a managed iterator -- a special iterator that - * uses less resources by having the ability to free its underlying - * resources on request. - * - * @param managed if true, then managed iterators will be enabled. - * @return the reference to the current ReadOptions. - * - * @deprecated This options is not used anymore. - */ - @Deprecated - public ReadOptions setManaged(final boolean managed) { - assert(isOwningHandle()); - setManaged(nativeHandle_, managed); - return this; - } - /** * Returns whether a total seek order will be used * @@ -398,7 +367,10 @@ public ReadOptions setMaxSkippableInternalKeys( * Default: false * * @return true if keys deleted using the DeleteRange() API will be visible + * + * @deprecated This option may be remove in a future release. */ + @Deprecated public boolean ignoreRangeDeletions() { assert(isOwningHandle()); return ignoreRangeDeletions(nativeHandle_); @@ -414,7 +386,10 @@ public boolean ignoreRangeDeletions() { * @param ignoreRangeDeletions true if keys deleted using the DeleteRange() * API should be visible * @return the reference to the current ReadOptions. + * + * @deprecated This option may be remove in a future release. */ + @Deprecated public ReadOptions setIgnoreRangeDeletions(final boolean ignoreRangeDeletions) { assert(isOwningHandle()); setIgnoreRangeDeletions(nativeHandle_, ignoreRangeDeletions); @@ -813,8 +788,6 @@ protected final void disposeInternal(final long handle) { private static native void setReadTier(long handle, byte readTierValue); private static native boolean tailing(long handle); private static native void setTailing(long handle, boolean tailing); - private static native boolean managed(long handle); - private static native void setManaged(long handle, boolean managed); private static native boolean totalOrderSeek(long handle); private static native void setTotalOrderSeek(long handle, boolean totalOrderSeek); private static native boolean prefixSameAsStart(long handle); diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java index 1ffb44b6a1b2..ebe134726982 100644 --- a/java/src/main/java/org/rocksdb/RocksDB.java +++ b/java/src/main/java/org/rocksdb/RocksDB.java @@ -84,13 +84,7 @@ public static void loadLibrary() { return; } - while (libraryLoaded.get() == LibraryState.LOADING) { - try { - Thread.sleep(10); - } catch(final InterruptedException e) { - //ignore - } - } + waitForLibraryToBeLoaded(); } /** @@ -146,12 +140,28 @@ public static void loadLibrary(final List paths) { return; } - while (libraryLoaded.get() == LibraryState.LOADING) { - try { - Thread.sleep(10); - } catch(final InterruptedException e) { - //ignore + waitForLibraryToBeLoaded(); + } + + private static void waitForLibraryToBeLoaded() { + final long wait = 10; // Time to wait before re-checking if another thread loaded the library + final long timeout = + 10 * 1000; // Maximum time to wait for another thread to load the library (10 seconds) + long waited = 0; + try { + while (libraryLoaded.get() == LibraryState.LOADING) { + Thread.sleep(wait); + waited += wait; + + if (waited >= timeout) { + throw new RuntimeException( + "Exceeded timeout whilst trying to load the RocksDB shared library"); + } } + } catch (final InterruptedException e) { + // restore interrupted status + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted whilst trying to load the RocksDB shared library", e); } } @@ -4074,6 +4084,23 @@ public void continueBackgroundWork() throws RocksDBException { continueBackgroundWork(nativeHandle_); } + /** + * Abort all running and pending compaction jobs. This method will signal + * all active compactions to terminate and wait for them to complete. + * No new compactions will be scheduled until {@link #resumeAllCompactions()} is called. + */ + public void abortAllCompactions() { + abortAllCompactions(nativeHandle_); + } + + /** + * Resume compaction scheduling after {@link #abortAllCompactions()} was called. + * Must be called the same number of times as {@link #abortAllCompactions()}. + */ + public void resumeAllCompactions() { + resumeAllCompactions(nativeHandle_); + } + /** * Enable automatic compactions for the given column * families if they were previously disabled. @@ -4126,6 +4153,7 @@ public int numberLevels(/* @Nullable */final ColumnFamilyHandle columnFamilyHand * * @return the maximum level */ + @Deprecated public int maxMemCompactionLevel() { return maxMemCompactionLevel(null); } @@ -4633,10 +4661,13 @@ public Range suggestCompactRange() * @param targetLevel the target level for L0 * * @throws RocksDBException if an error occurs whilst promoting L0 + * + * @deprecated this API may be removed in a future release. */ + @Deprecated public void promoteL0( - /* @Nullable */final ColumnFamilyHandle columnFamilyHandle, - final int targetLevel) throws RocksDBException { + /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, final int targetLevel) + throws RocksDBException { promoteL0(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, targetLevel); @@ -4648,9 +4679,11 @@ public void promoteL0( * @param targetLevel the target level for L0 * * @throws RocksDBException if an error occurs whilst promoting L0 + * + * @deprecated this API may be removed in a future release. */ - public void promoteL0(final int targetLevel) - throws RocksDBException { + @Deprecated + public void promoteL0(final int targetLevel) throws RocksDBException { promoteL0(null, targetLevel); } @@ -5020,6 +5053,8 @@ private static native String[] compactFiles(final long handle, final long compac private static native void cancelAllBackgroundWork(final long handle, final boolean wait); private static native void pauseBackgroundWork(final long handle) throws RocksDBException; private static native void continueBackgroundWork(final long handle) throws RocksDBException; + private static native void abortAllCompactions(final long handle); + private static native void resumeAllCompactions(final long handle); private static native void enableAutoCompaction( final long handle, final long[] columnFamilyHandles) throws RocksDBException; private static native int numberLevels(final long handle, final long columnFamilyHandle); diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index 3b488660e851..41e6b7239425 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -550,14 +550,14 @@ public enum TickerType { BLOB_DB_BYTES_READ((byte) -0x2), /** - * # of keys written by BlobDB as non-TTL inlined value. + * Deprecated and unused. Retained to avoid shifting enum values. */ - BLOB_DB_WRITE_INLINED((byte) -0x3), + @Deprecated BLOB_DB_WRITE_INLINED((byte) -0x3), /** - * # of keys written by BlobDB as TTL inlined value. + * Deprecated and unused. Retained to avoid shifting enum values. */ - BLOB_DB_WRITE_INLINED_TTL((byte) -0x4), + @Deprecated BLOB_DB_WRITE_INLINED_TTL((byte) -0x4), /** * # of keys written by BlobDB as non-TTL blob value. @@ -764,10 +764,14 @@ public enum TickerType { */ HOT_FILE_READ_BYTES((byte) -0x31), WARM_FILE_READ_BYTES((byte) -0x32), + COOL_FILE_READ_BYTES((byte) -0x5B), COLD_FILE_READ_BYTES((byte) -0x33), + ICE_FILE_READ_BYTES((byte) -0x59), HOT_FILE_READ_COUNT((byte) -0x34), WARM_FILE_READ_COUNT((byte) -0x35), + COOL_FILE_READ_COUNT((byte) -0x5C), COLD_FILE_READ_COUNT((byte) -0x36), + ICE_FILE_READ_COUNT((byte) -0x5A), /** * (non-)last level read statistics @@ -870,6 +874,8 @@ public enum TickerType { FIFO_TTL_COMPACTIONS((byte) -0x50), + FIFO_CHANGE_TEMPERATURE_COMPACTIONS((byte) -0x58), + PREFETCH_BYTES((byte) -0x51), PREFETCH_BYTES_USEFUL((byte) -0x52), @@ -882,6 +888,73 @@ public enum TickerType { FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT((byte) -0x57), + /** + * Counter for the number of times a WBWI is ingested into the DB. This + * happens when IngestWriteBatchWithIndex() is used and when large + * transaction optimization is enabled through + * TransactionOptions::large_txn_commit_optimize_threshold. + */ + NUMBER_WBWI_INGEST((byte) -0x5D), + + /** + * Failure to load the UDI during SST table open + */ + SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT((byte) -0x5E), + + /** + * Bytes of output files successfully resumed during remote compaction + */ + REMOTE_COMPACT_RESUMED_BYTES((byte) -0x5F), + + /** + * MultiScan statistics + */ + + /** + * # of calls to Iterator::Prepare() for multi-scan + */ + MULTISCAN_PREPARE_CALLS((byte) -0x60), + + /** + * # of errors during Iterator::Prepare() for multi-scan + */ + MULTISCAN_PREPARE_ERRORS((byte) -0x61), + + /** + * # of data blocks prefetched during multi-scan Prepare() + */ + MULTISCAN_BLOCKS_PREFETCHED((byte) -0x62), + + /** + * # of data blocks found in cache during multi-scan Prepare() + */ + MULTISCAN_BLOCKS_FROM_CACHE((byte) -0x63), + + /** + * Total bytes prefetched during multi-scan Prepare() + */ + MULTISCAN_PREFETCH_BYTES((byte) -0x64), + + /** + * # of prefetched blocks that were never accessed (wasted) + */ + MULTISCAN_PREFETCH_BLOCKS_WASTED((byte) -0x65), + + /** + * # of I/O requests issued during multi-scan Prepare() + */ + MULTISCAN_IO_REQUESTS((byte) -0x66), + + /** + * # of non-adjacent blocks coalesced into single I/O request + */ + MULTISCAN_IO_COALESCED_NONADJACENT((byte) -0x67), + + /** + * # of seek errors during multi-scan iteration + */ + MULTISCAN_SEEK_ERRORS((byte) -0x68), + TICKER_ENUM_MAX((byte) -0x54); private final byte value; diff --git a/java/src/main/java/org/rocksdb/Transaction.java b/java/src/main/java/org/rocksdb/Transaction.java index d1ddcbcbe6c7..ee8656460835 100644 --- a/java/src/main/java/org/rocksdb/Transaction.java +++ b/java/src/main/java/org/rocksdb/Transaction.java @@ -203,7 +203,7 @@ public void prepare() throws RocksDBException { * Status::Busy() may be returned if the transaction could not guarantee * that there are no write conflicts. Status::TryAgain() may be returned * if the memtable history size is not large enough - * (See max_write_buffer_number_to_maintain). + * (See max_write_buffer_size_to_maintain). *

* If this transaction was created by a {@link TransactionDB}, * Status::Expired() may be returned if this transaction has lived for @@ -661,6 +661,46 @@ public List multiGetAsList(final ReadOptions readOptions, final List + * If {@link ReadOptions#snapshot()} is not set, the current version of the + * key will be read. Calling {@link #setSnapshot()} does not affect the + * version of the data returned. + *

+ * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect + * what is read from the DB but will NOT change which keys are read from this + * transaction (the keys in this transaction do not yet belong to any snapshot + * and will be fetched regardless). + *

+ * This method uses the optimized path with support for batched reads. + * + * @param readOptions Read options.= + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param keys of keys for which values need to be retrieved. + * + * @return Array of values, one for each key + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public List multiGetAsList(final ReadOptions readOptions, + final ColumnFamilyHandle columnFamilyHandle, final List keys) + throws RocksDBException { + if (keys.isEmpty()) { + return new ArrayList<>(0); + } + final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); + return Arrays.asList(multiGet( + nativeHandle_, readOptions.nativeHandle_, columnFamilyHandle.nativeHandle_, keysArray)); + } + /** * Read this key and ensure that this transaction will only * be able to be committed if this key is not written outside this @@ -689,8 +729,7 @@ public List multiGetAsList(final ReadOptions readOptions, final List result = + txn.multiGetAsList(readOptions, testCf, Arrays.asList(k1, k2, k3)); + assertThat(result).containsExactly(v1, v2, null); + } + } + } + @Test public void name() throws RocksDBException { try(final DBContainer dbContainer = startDb(); diff --git a/logging/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc index 2e2747729eb1..30ade0f38919 100644 --- a/logging/auto_roll_logger_test.cc +++ b/logging/auto_roll_logger_test.cc @@ -647,7 +647,7 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) { } TEST_F(AutoRollLoggerTest, LogFileExistence) { - ROCKSDB_NAMESPACE::DB* db; + std::unique_ptr db; ROCKSDB_NAMESPACE::Options options; #ifdef OS_WIN // Replace all slashes in the path so windows CompSpec does not @@ -664,7 +664,6 @@ TEST_F(AutoRollLoggerTest, LogFileExistence) { options.create_if_missing = true; ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kTestDir, &db)); ASSERT_OK(default_env->FileExists(kLogFile)); - delete db; } TEST_F(AutoRollLoggerTest, FileCreateFailure) { diff --git a/memory/memory_allocator_impl.h b/memory/memory_allocator_impl.h index f1d3b9472ccc..65ebfebb94c9 100644 --- a/memory/memory_allocator_impl.h +++ b/memory/memory_allocator_impl.h @@ -12,8 +12,8 @@ namespace ROCKSDB_NAMESPACE { -struct CustomDeleter { - CustomDeleter(MemoryAllocator* a = nullptr) : allocator(a) {} +struct CacheAllocationDeleter { + CacheAllocationDeleter(MemoryAllocator* a = nullptr) : allocator(a) {} void operator()(char* ptr) const { if (allocator) { @@ -26,12 +26,12 @@ struct CustomDeleter { MemoryAllocator* allocator; }; -using CacheAllocationPtr = std::unique_ptr; +using CacheAllocationPtr = std::unique_ptr; inline CacheAllocationPtr AllocateBlock(size_t size, MemoryAllocator* allocator) { if (allocator) { - auto block = reinterpret_cast(allocator->Allocate(size)); + auto block = static_cast(allocator->Allocate(size)); return CacheAllocationPtr(block, allocator); } return CacheAllocationPtr(new char[size]); diff --git a/memory/memory_allocator_test.cc b/memory/memory_allocator_test.cc index 2ae38ec11b57..669548970ad2 100644 --- a/memory/memory_allocator_test.cc +++ b/memory/memory_allocator_test.cc @@ -83,7 +83,7 @@ TEST_P(MemoryAllocatorTest, DatabaseBlockCache) { auto cache = NewLRUCache(1024 * 1024, 6, false, 0.0, allocator_); table_options.block_cache = cache; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - DB* db = nullptr; + std::unique_ptr db; Status s = DB::Open(options, dbname, &db); ASSERT_OK(s); ASSERT_NE(db, nullptr); @@ -115,7 +115,7 @@ TEST_P(MemoryAllocatorTest, DatabaseBlockCache) { // Close database s = db->Close(); ASSERT_OK(s); - delete db; + db.reset(); ASSERT_OK(DestroyDB(dbname, options)); } diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h index 9fdf618fa550..d39091ec6d43 100644 --- a/memtable/inlineskiplist.h +++ b/memtable/inlineskiplist.h @@ -44,8 +44,6 @@ #include #include -#include -#include #include #include "memory/allocator.h" @@ -53,7 +51,7 @@ #include "port/port.h" #include "rocksdb/slice.h" #include "test_util/sync_point.h" -#include "util/coding.h" +#include "util/atomic.h" #include "util/random.h" namespace ROCKSDB_NAMESPACE { @@ -182,8 +180,11 @@ class InlineSkipList { // Advance to the first entry with a key >= target void Seek(const char* target); - [[nodiscard]] Status SeekAndValidate(const char* target, - bool allow_data_in_errors); + [[nodiscard]] Status SeekAndValidate( + const char* target, bool allow_data_in_errors, + bool detect_key_out_of_order, + const std::function& + key_validation_callback); // Retreat to the last entry with a key <= target void SeekForPrev(const char* target); @@ -215,18 +216,17 @@ class InlineSkipList { Comparator const compare_; Node* const head_; - // Modified only by Insert(). Read racily by readers, but stale - // values are ok. - std::atomic max_height_; // Height of the entire list + // Maximum height of any node in the list (or in the process of being added). + // Modified only by Insert(). Relaxed reads are always OK because starting + // from higher levels only helps efficiency, not correctness. + RelaxedAtomic max_height_; // seq_splice_ is a Splice used for insertions in the non-concurrent // case. It caches the prev and next found during the most recent // non-concurrent insertion. Splice* seq_splice_; - inline int GetMaxHeight() const { - return max_height_.load(std::memory_order_relaxed); - } + inline int GetMaxHeight() const { return max_height_.LoadRelaxed(); } int RandomHeight(); @@ -246,20 +246,23 @@ class InlineSkipList { bool KeyIsAfterNode(const DecodedKey& key, Node* n) const; // Returns the earliest node with a key >= key. - // Returns nullptr if there is no such node. - // @param out_of_order_node If not null, will validate the order of visited - // nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be - // returned and *out_of_order_node will be set to n2. - Node* FindGreaterOrEqual(const char* key, Node** out_of_order_node) const; + // Returns OK, if no corruption is found. + // node is set to the found node, or to nullptr if no node is found. + // Returns Corruption if a corruption is found. + Status FindGreaterOrEqual(const char* key, Node** node, + bool detect_key_out_of_order, + bool allow_data_in_errors, + const std::function& + key_validation_callback) const; // Returns the latest node with a key < key. // Returns head_ if there is no such node. // Fills prev[level] with pointer to previous node at "level" for every // level in [0..max_height_-1], if prev is non-null. - // @param out_of_order_node If not null, will validate the order of visited + // @param corrupted_node If not null, will validate the order of visited // nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be - // returned and *out_of_order_node will be set to n2. - Node* FindLessThan(const char* key, Node** out_of_order_node) const; + // returned and *corrupted_node will be set to n2. + Node* FindLessThan(const char* key, Node** corrupted_node) const; // Return the last node in the list. // Return head_ if list is empty. @@ -311,7 +314,7 @@ struct InlineSkipList::Node { // Stores the height of the node in the memory location normally used for // next_[0]. This is used for passing data from AllocateKey to Insert. void StashHeight(const int height) { - assert(sizeof(int) <= sizeof(next_[0])); + static_assert(sizeof(int) <= sizeof(next_[0])); memcpy(static_cast(&next_[0]), &height, sizeof(int)); } @@ -332,30 +335,30 @@ struct InlineSkipList::Node { assert(n >= 0); // Use an 'acquire load' so that we observe a fully initialized // version of the returned Node. - return ((&next_[0] - n)->load(std::memory_order_acquire)); + return ((&next_[0] - n)->Load()); } void SetNext(int n, Node* x) { assert(n >= 0); // Use a 'release store' so that anybody who reads through this // pointer observes a fully initialized version of the inserted node. - (&next_[0] - n)->store(x, std::memory_order_release); + (&next_[0] - n)->Store(x); } bool CASNext(int n, Node* expected, Node* x) { assert(n >= 0); - return (&next_[0] - n)->compare_exchange_strong(expected, x); + return (&next_[0] - n)->CasStrong(expected, x); } // No-barrier variants that can be safely used in a few locations. Node* NoBarrier_Next(int n) { assert(n >= 0); - return (&next_[0] - n)->load(std::memory_order_relaxed); + return (&next_[0] - n)->LoadRelaxed(); } void NoBarrier_SetNext(int n, Node* x) { assert(n >= 0); - (&next_[0] - n)->store(x, std::memory_order_relaxed); + (&next_[0] - n)->StoreRelaxed(x); } // Insert node after prev on specific level. @@ -369,7 +372,7 @@ struct InlineSkipList::Node { private: // next_[0] is the lowest level link (level 0). Higher levels are // stored _earlier_, so level 1 is at next_[-1]. - std::atomic next_[1]; + Atomic next_[1]; }; template @@ -399,6 +402,12 @@ inline const char* InlineSkipList::Iterator::key() const { template inline void InlineSkipList::Iterator::Next() { assert(Valid()); + + // Capture the key before move on to next node + TEST_SYNC_POINT_CALLBACK( + "InlineSkipList::Iterator::Next::key", + static_cast(const_cast((node_->Key())))); + node_ = node_->Next(0); } @@ -406,6 +415,12 @@ template inline Status InlineSkipList::Iterator::NextAndValidate( bool allow_data_in_errors) { assert(Valid()); + + // Capture the key before move on to next node + TEST_SYNC_POINT_CALLBACK( + "InlineSkipList::Iterator::Next::key", + static_cast(const_cast((node_->Key())))); + Node* prev_node = node_; node_ = node_->Next(0); // Verify that keys are increasing. @@ -435,12 +450,12 @@ inline Status InlineSkipList::Iterator::PrevAndValidate( const bool allow_data_in_errors) { assert(Valid()); // Skip list validation is done in FindLessThan(). - Node* out_of_order_node = nullptr; - node_ = list_->FindLessThan(node_->Key(), &out_of_order_node); - if (out_of_order_node) { + Node* corrupted_node = nullptr; + node_ = list_->FindLessThan(node_->Key(), &corrupted_node); + if (corrupted_node) { Node* node = node_; node_ = nullptr; - return Corruption(node, out_of_order_node, allow_data_in_errors); + return Corruption(node, corrupted_node, allow_data_in_errors); } if (node_ == list_->head_) { node_ = nullptr; @@ -450,20 +465,19 @@ inline Status InlineSkipList::Iterator::PrevAndValidate( template inline void InlineSkipList::Iterator::Seek(const char* target) { - node_ = list_->FindGreaterOrEqual(target, nullptr); + auto status = + list_->FindGreaterOrEqual(target, &node_, false, false, nullptr); + assert(status.ok()); } template inline Status InlineSkipList::Iterator::SeekAndValidate( - const char* target, const bool allow_data_in_errors) { - Node* out_of_order_node = nullptr; - node_ = list_->FindGreaterOrEqual(target, &out_of_order_node); - if (out_of_order_node) { - Node* node = node_; - node_ = nullptr; - return Corruption(node, out_of_order_node, allow_data_in_errors); - } - return Status::OK(); + const char* target, const bool allow_data_in_errors, + bool check_key_out_of_order, + const std::function& key_validation_callback) { + return list_->FindGreaterOrEqual(target, &node_, allow_data_in_errors, + check_key_out_of_order, + key_validation_callback); } template @@ -530,15 +544,18 @@ bool InlineSkipList::KeyIsAfterNode(const DecodedKey& key, } template -typename InlineSkipList::Node* -InlineSkipList::FindGreaterOrEqual( - const char* key, Node** const out_of_order_node) const { +Status InlineSkipList::FindGreaterOrEqual( + const char* key, Node** node, bool allow_data_in_errors, + bool detect_key_out_of_order, + const std::function& key_validation_callback) + const { // Note: It looks like we could reduce duplication by implementing // this function as FindLessThan(key)->Next(0), but we wouldn't be able // to exit early on equality and the result wouldn't even be correct. // A concurrent insert might occur after FindLessThan(key) but before // we get a chance to call Next(0). Node* x = head_; + *node = nullptr; int level = GetMaxHeight() - 1; Node* last_bigger = nullptr; const DecodedKey key_decoded = compare_.decode_key(key); @@ -546,10 +563,16 @@ InlineSkipList::FindGreaterOrEqual( Node* next = x->Next(level); if (next != nullptr) { PREFETCH(next->Next(level), 0, 1); - if (out_of_order_node && x != head_ && + if (detect_key_out_of_order && x != head_ && compare_(x->Key(), next->Key()) >= 0) { - *out_of_order_node = next; - return x; + return Corruption(x, next, allow_data_in_errors); + } + if (key_validation_callback != nullptr) { + auto status = + key_validation_callback(next->Key(), allow_data_in_errors); + if (!status.ok()) { + return status; + } } } // Make sure the lists are sorted @@ -560,7 +583,8 @@ InlineSkipList::FindGreaterOrEqual( ? 1 : compare_(next->Key(), key_decoded); if (cmp == 0 || (cmp > 0 && level == 0)) { - return next; + *node = next; + return Status::OK(); } else if (cmp < 0) { // Keep searching in this list x = next; @@ -789,7 +813,7 @@ char* InlineSkipList::AllocateKey(size_t key_size) { template typename InlineSkipList::Node* InlineSkipList::AllocateNode(size_t key_size, int height) { - auto prefix = sizeof(std::atomic) * (height - 1); + auto prefix = sizeof(Atomic) * (height - 1); // prefix is space for the height - 1 pointers that we store before // the Node instance (next_[-(height - 1) .. -1]). Node starts at @@ -923,9 +947,9 @@ bool InlineSkipList::Insert(const char* key, Splice* splice, int height = x->UnstashHeight(); assert(height >= 1 && height <= kMaxHeight_); - int max_height = max_height_.load(std::memory_order_relaxed); + int max_height = max_height_.LoadRelaxed(); while (height > max_height) { - if (max_height_.compare_exchange_weak(max_height, height)) { + if (max_height_.CasWeakRelaxed(max_height, height)) { // successfully updated it max_height = height; break; @@ -1116,7 +1140,9 @@ bool InlineSkipList::Insert(const char* key, Splice* splice, template bool InlineSkipList::Contains(const char* key) const { - Node* x = FindGreaterOrEqual(key, nullptr); + Node* x = nullptr; + auto status = FindGreaterOrEqual(key, &x, false, false, nullptr); + assert(status.ok()); if (x != nullptr && Equal(key, x->Key())) { return true; } else { diff --git a/memtable/skiplist.h b/memtable/skiplist.h index f2e2a829de3b..594c6ec43ce4 100644 --- a/memtable/skiplist.h +++ b/memtable/skiplist.h @@ -34,10 +34,9 @@ #include #include -#include - #include "memory/allocator.h" #include "port/port.h" +#include "util/atomic.h" #include "util/random.h" namespace ROCKSDB_NAMESPACE { @@ -128,18 +127,16 @@ class SkipList { // Modified only by Insert(). Read racily by readers, but stale // values are ok. - std::atomic max_height_; // Height of the entire list + RelaxedAtomic max_height_; // Height of the entire list // Used for optimizing sequential insert patterns. Tricky. prev_[i] for // i up to max_height_ is the predecessor of prev_[0] and prev_height_ // is the height of prev_[0]. prev_[0] can only be equal to head before // insertion, in which case max_height_ and prev_height_ are 1. - Node** prev_; int32_t prev_height_; + Node** prev_; - inline int GetMaxHeight() const { - return max_height_.load(std::memory_order_relaxed); - } + inline int GetMaxHeight() const { return max_height_.LoadRelaxed(); } Node* NewNode(const Key& key, int height); int RandomHeight(); @@ -179,35 +176,35 @@ struct SkipList::Node { assert(n >= 0); // Use an 'acquire load' so that we observe a fully initialized // version of the returned Node. - return (next_[n].load(std::memory_order_acquire)); + return (next_[n].Load()); } void SetNext(int n, Node* x) { assert(n >= 0); // Use a 'release store' so that anybody who reads through this // pointer observes a fully initialized version of the inserted node. - next_[n].store(x, std::memory_order_release); + next_[n].Store(x); } // No-barrier variants that can be safely used in a few locations. Node* NoBarrier_Next(int n) { assert(n >= 0); - return next_[n].load(std::memory_order_relaxed); + return next_[n].LoadRelaxed(); } void NoBarrier_SetNext(int n, Node* x) { assert(n >= 0); - next_[n].store(x, std::memory_order_relaxed); + next_[n].StoreRelaxed(x); } private: // Array of length equal to the node height. next_[0] is lowest level link. - std::atomic next_[1]; + Atomic next_[1]; }; template typename SkipList::Node* SkipList::NewNode( const Key& key, int height) { - char* mem = allocator_->AllocateAligned( - sizeof(Node) + sizeof(std::atomic) * (height - 1)); + char* mem = allocator_->AllocateAligned(sizeof(Node) + + sizeof(Atomic) * (height - 1)); return new (mem) Node(key); } @@ -438,7 +435,7 @@ SkipList::SkipList(const Comparator cmp, Allocator* allocator, kScaledInverseBranching_((Random::kMaxNext + 1) / kBranching_), compare_(cmp), allocator_(allocator), - head_(NewNode(0 /* any key will do */, max_height)), + head_(NewNode({} /* any key will do */, max_height)), max_height_(1), prev_height_(1) { assert(max_height > 0 && kMaxHeight_ == static_cast(max_height)); @@ -494,7 +491,7 @@ void SkipList::Insert(const Key& key) { // the loop below. In the former case the reader will // immediately drop to the next level since nullptr sorts after all // keys. In the latter case the reader will use the new node. - max_height_.store(height, std::memory_order_relaxed); + max_height_.StoreRelaxed(height); } Node* x = NewNode(key, height); diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc index 93d32e9fec6e..c83baeeefcb2 100644 --- a/memtable/skiplistrep.cc +++ b/memtable/skiplistrep.cc @@ -94,11 +94,14 @@ class SkipListRep : public MemTableRep { Status GetAndValidate(const LookupKey& k, void* callback_args, bool (*callback_func)(void* arg, const char* entry), - bool allow_data_in_errors) override { + bool allow_data_in_errors, bool detect_key_out_of_order, + const std::function& + key_validation_callback) override { SkipListRep::Iterator iter(&skip_list_); Slice dummy_slice; - Status status = iter.SeekAndValidate(dummy_slice, k.memtable_key().data(), - allow_data_in_errors); + Status status = iter.SeekAndValidate( + dummy_slice, k.memtable_key().data(), allow_data_in_errors, + detect_key_out_of_order, key_validation_callback); for (; iter.Valid() && status.ok() && callback_func(callback_args, iter.key()); status = iter.NextAndValidate(allow_data_in_errors)) { @@ -244,12 +247,18 @@ class SkipListRep : public MemTableRep { } Status SeekAndValidate(const Slice& user_key, const char* memtable_key, - bool allow_data_in_errors) override { + bool allow_data_in_errors, + bool detect_key_out_of_order, + const std::function& + key_validation_callback) override { if (memtable_key != nullptr) { - return iter_.SeekAndValidate(memtable_key, allow_data_in_errors); + return iter_.SeekAndValidate(memtable_key, allow_data_in_errors, + detect_key_out_of_order, + key_validation_callback); } else { - return iter_.SeekAndValidate(EncodeKey(&tmp_, user_key), - allow_data_in_errors); + return iter_.SeekAndValidate( + EncodeKey(&tmp_, user_key), allow_data_in_errors, + detect_key_out_of_order, key_validation_callback); } } diff --git a/memtable/vectorrep.cc b/memtable/vectorrep.cc index 9b0192cb8e8e..738f89f79e9e 100644 --- a/memtable/vectorrep.cc +++ b/memtable/vectorrep.cc @@ -30,6 +30,8 @@ class VectorRep : public MemTableRep { // collection. void Insert(KeyHandle handle) override; + void InsertConcurrently(KeyHandle handle) override; + // Returns true iff an entry that compares equal to key is in the collection. bool Contains(const char* key) const override; @@ -40,6 +42,8 @@ class VectorRep : public MemTableRep { void Get(const LookupKey& k, void* callback_args, bool (*callback_func)(void* arg, const char* entry)) override; + void BatchPostProcess() override; + ~VectorRep() override = default; class Iterator : public MemTableRep::Iterator { @@ -79,6 +83,13 @@ class VectorRep : public MemTableRep { // Advance to the first entry with a key >= target void Seek(const Slice& user_key, const char* memtable_key) override; + // Seek and do some memory validation + Status SeekAndValidate(const Slice& internal_key, const char* memtable_key, + bool allow_data_in_errors, + bool detect_key_out_of_order, + const std::function& + key_validation_callback) override; + // Advance to the first entry with a key <= target void SeekForPrev(const Slice& user_key, const char* memtable_key) override; @@ -96,19 +107,40 @@ class VectorRep : public MemTableRep { private: friend class Iterator; + ALIGN_AS(CACHE_LINE_SIZE) RelaxedAtomic bucket_size_; using Bucket = std::vector; std::shared_ptr bucket_; mutable port::RWMutex rwlock_; bool immutable_; bool sorted_; const KeyComparator& compare_; + // Thread-local vector to buffer concurrent writes. + using TlBucket = std::vector; + ThreadLocalPtr tl_writes_; + + static void DeleteTlBucket(void* ptr) { + auto* v = static_cast(ptr); + delete v; + } }; void VectorRep::Insert(KeyHandle handle) { auto* key = static_cast(handle); - WriteLock l(&rwlock_); - assert(!immutable_); - bucket_->push_back(key); + { + WriteLock l(&rwlock_); + assert(!immutable_); + bucket_->push_back(key); + } + bucket_size_.FetchAddRelaxed(1); +} + +void VectorRep::InsertConcurrently(KeyHandle handle) { + auto* v = static_cast(tl_writes_.Get()); + if (!v) { + v = new TlBucket(); + tl_writes_.Reset(v); + } + v->push_back(static_cast(handle)); } // Returns true iff an entry that compares equal to key is in the collection. @@ -123,19 +155,35 @@ void VectorRep::MarkReadOnly() { } size_t VectorRep::ApproximateMemoryUsage() { - return sizeof(bucket_) + sizeof(*bucket_) + - bucket_->size() * - sizeof( - std::remove_reference::type::value_type); + return bucket_size_.LoadRelaxed() * + sizeof(std::remove_reference::type::value_type); +} + +void VectorRep::BatchPostProcess() { + auto* v = static_cast(tl_writes_.Get()); + if (v) { + { + WriteLock l(&rwlock_); + assert(!immutable_); + for (auto& key : *v) { + bucket_->push_back(key); + } + } + bucket_size_.FetchAddRelaxed(v->size()); + delete v; + tl_writes_.Reset(nullptr); + } } VectorRep::VectorRep(const KeyComparator& compare, Allocator* allocator, size_t count) : MemTableRep(allocator), + bucket_size_(0), bucket_(new Bucket()), immutable_(false), sorted_(false), - compare_(compare) { + compare_(compare), + tl_writes_(DeleteTlBucket) { bucket_.get()->reserve(count); } @@ -221,6 +269,24 @@ void VectorRep::Iterator::Seek(const Slice& user_key, .first; } +Status VectorRep::Iterator::SeekAndValidate( + const Slice& /* internal_key */, const char* /* memtable_key */, + bool /* allow_data_in_errors */, bool /* detect_key_out_of_order */, + const std::function& + /* key_validation_callback */) { + if (vrep_) { + WriteLock l(&vrep_->rwlock_); + if (bucket_->begin() == bucket_->end()) { + // Memtable is empty + return Status::OK(); + } else { + return Status::NotSupported("SeekAndValidate() not implemented"); + } + } else { + return Status::NotSupported("SeekAndValidate() not implemented"); + } +} + // Advance to the first entry with a key <= target void VectorRep::Iterator::SeekForPrev(const Slice& /*user_key*/, const char* /*memtable_key*/) { diff --git a/memtable/wbwi_memtable.cc b/memtable/wbwi_memtable.cc index 540253666908..9686eac50299 100644 --- a/memtable/wbwi_memtable.cc +++ b/memtable/wbwi_memtable.cc @@ -61,6 +61,7 @@ bool WBWIMemTable::Get(const LookupKey& key, std::string* value, assert(!wbwi_->GetWriteBatch()->HasDeleteRange()); assert(merge_context); + *out_seq = kMaxSequenceNumber; [[maybe_unused]] SequenceNumber read_seq = GetInternalKeySeqno(key.internal_key()); // This is memtable is a single write batch, no snapshot can be taken within diff --git a/memtable/wbwi_memtable.h b/memtable/wbwi_memtable.h index 3f0ae3e23d5b..b3231b4d565d 100644 --- a/memtable/wbwi_memtable.h +++ b/memtable/wbwi_memtable.h @@ -235,7 +235,7 @@ class WBWIMemTable final : public ReadOnlyMemTable { uint64_t num_entries_; // WBWI can contains updates to multiple CFs. `cf_id_` determines which CF // this memtable is for. - uint32_t cf_id_; + const uint32_t cf_id_; }; class WBWIMemTableIterator final : public InternalIterator { diff --git a/microbench/db_basic_bench.cc b/microbench/db_basic_bench.cc index 2eca31f10843..dd4bbb0d68f7 100644 --- a/microbench/db_basic_bench.cc +++ b/microbench/db_basic_bench.cc @@ -138,13 +138,11 @@ static void SetupDB(benchmark::State& state, Options& options, db_path + kFilePathSeparator + test_name + std::to_string(getpid()); DestroyDB(db_name, options); - DB* db_ptr = nullptr; - s = DB::Open(options, db_name, &db_ptr); + s = DB::Open(options, db_name, db); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); return; } - db->reset(db_ptr); } static void TeardownDB(benchmark::State& state, const std::unique_ptr& db, @@ -181,12 +179,10 @@ static void DBOpen(benchmark::State& state) { for (auto _ : state) { { - DB* db_ptr = nullptr; - Status s = DB::Open(options, db_name, &db_ptr); + Status s = DB::Open(options, db_name, &db); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); } - db.reset(db_ptr); } state.PauseTiming(); auto wo = WriteOptions(); @@ -231,12 +227,10 @@ static void DBClose(benchmark::State& state) { for (auto _ : state) { state.PauseTiming(); { - DB* db_ptr = nullptr; - Status s = DB::Open(options, db_name, &db_ptr); + Status s = DB::Open(options, db_name, &db); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); } - db.reset(db_ptr); } auto wo = WriteOptions(); Status s; @@ -727,13 +721,11 @@ static void SimpleGetWithPerfContext(benchmark::State& state) { DestroyDB(db_name, options); { - DB* db_ptr = nullptr; - s = DB::Open(options, db_name, &db_ptr); + s = DB::Open(options, db_name, &db); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); return; } - db.reset(db_ptr); } // load db auto wo = WriteOptions(); diff --git a/microbench/ribbon_bench.cc b/microbench/ribbon_bench.cc index d0fb2ec9ab2e..58cd710a4c70 100644 --- a/microbench/ribbon_bench.cc +++ b/microbench/ribbon_bench.cc @@ -32,7 +32,7 @@ struct KeyMaker { // To get range [avg_size - 2, avg_size + 2] // use range [smallest_size, smallest_size + 4] len += FastRange32((val_num >> 5) * 1234567891, 5); - char *data = buf_.get() + start; + char* data = buf_.get() + start; // Populate key data such that all data makes it into a key of at // least 8 bytes. We also don't want all the within-filter key // variance confined to a contiguous 32 bits, because then a 32 bit @@ -51,7 +51,7 @@ struct KeyMaker { // 1. filter config bits_per_key // 2. average data key length // 3. data entry number -static void CustomArguments(benchmark::internal::Benchmark *b) { +static void CustomArguments(benchmark::internal::Benchmark* b) { const auto kImplCount = static_cast(BloomLikeFilterPolicy::GetAllFixedImpls().size()); for (int filter_impl = 0; filter_impl < kImplCount; ++filter_impl) { @@ -66,7 +66,7 @@ static void CustomArguments(benchmark::internal::Benchmark *b) { b->ArgNames({"filter_impl", "bits_per_key", "key_len_avg", "entry_num"}); } -static void FilterBuild(benchmark::State &state) { +static void FilterBuild(benchmark::State& state) { // setup data auto filter = BloomLikeFilterPolicy::Create( BloomLikeFilterPolicy::GetAllFixedImpls().at(state.range(0)), @@ -89,7 +89,7 @@ static void FilterBuild(benchmark::State &state) { } BENCHMARK(FilterBuild)->Apply(CustomArguments); -static void FilterQueryPositive(benchmark::State &state) { +static void FilterQueryPositive(benchmark::State& state) { // setup data auto filter = BloomLikeFilterPolicy::Create( BloomLikeFilterPolicy::GetAllFixedImpls().at(state.range(0)), @@ -117,7 +117,7 @@ static void FilterQueryPositive(benchmark::State &state) { } BENCHMARK(FilterQueryPositive)->Apply(CustomArguments); -static void FilterQueryNegative(benchmark::State &state) { +static void FilterQueryNegative(benchmark::State& state) { // setup data auto filter = BloomLikeFilterPolicy::Create( BloomLikeFilterPolicy::GetAllFixedImpls().at(state.range(0)), diff --git a/monitoring/iostats_context.cc b/monitoring/iostats_context.cc index 04e98914da9c..9f96655a6b48 100644 --- a/monitoring/iostats_context.cc +++ b/monitoring/iostats_context.cc @@ -65,9 +65,11 @@ std::string IOStatsContext::ToString(bool exclude_zero_counters) const { IOSTATS_CONTEXT_OUTPUT(cpu_read_nanos); IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_bytes_read); IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_bytes_read); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cool_file_bytes_read); IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_bytes_read); IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_read_count); IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_read_count); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cool_file_read_count); IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_read_count); std::string str = ss.str(); str.erase(str.find_last_not_of(", ") + 1); diff --git a/monitoring/perf_context.cc b/monitoring/perf_context.cc index a38f6ec01805..59f5f19f66df 100644 --- a/monitoring/perf_context.cc +++ b/monitoring/perf_context.cc @@ -259,10 +259,10 @@ void PerfContext::Reset() { #endif } -void PerfContextByLevel::Reset(){ +void PerfContextByLevel::Reset() { #ifndef NPERF_CONTEXT #define EMIT_FIELDS(x) x = 0; - DEF_PERF_CONTEXT_LEVEL_METRICS(EMIT_FIELDS) + DEF_PERF_CONTEXT_LEVEL_METRICS(EMIT_FIELDS) #undef EMIT_FIELDS #endif } diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 05163d3e29e1..e6060cbeac20 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -93,6 +93,7 @@ const std::vector> TickersNameMap = { {COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, "rocksdb.compaction.optimized.del.drop.obsolete"}, {COMPACTION_CANCELLED, "rocksdb.compaction.cancelled"}, + {COMPACTION_ABORTED, "rocksdb.compaction.aborted"}, {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"}, {NUMBER_KEYS_READ, "rocksdb.number.keys.read"}, {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"}, @@ -169,8 +170,8 @@ const std::vector> TickersNameMap = { {BLOB_DB_NUM_KEYS_READ, "rocksdb.blobdb.num.keys.read"}, {BLOB_DB_BYTES_WRITTEN, "rocksdb.blobdb.bytes.written"}, {BLOB_DB_BYTES_READ, "rocksdb.blobdb.bytes.read"}, - {BLOB_DB_WRITE_INLINED, "rocksdb.blobdb.write.inlined"}, - {BLOB_DB_WRITE_INLINED_TTL, "rocksdb.blobdb.write.inlined.ttl"}, + {BLOB_DB_WRITE_INLINED_DEPRECATED, "rocksdb.blobdb.write.inlined"}, + {BLOB_DB_WRITE_INLINED_TTL_DEPRECATED, "rocksdb.blobdb.write.inlined.ttl"}, {BLOB_DB_WRITE_BLOB, "rocksdb.blobdb.write.blob"}, {BLOB_DB_WRITE_BLOB_TTL, "rocksdb.blobdb.write.blob.ttl"}, {BLOB_DB_BLOB_FILE_BYTES_WRITTEN, "rocksdb.blobdb.blob.file.bytes.written"}, @@ -224,12 +225,17 @@ const std::vector> TickersNameMap = { {BACKUP_WRITE_BYTES, "rocksdb.backup.write.bytes"}, {REMOTE_COMPACT_READ_BYTES, "rocksdb.remote.compact.read.bytes"}, {REMOTE_COMPACT_WRITE_BYTES, "rocksdb.remote.compact.write.bytes"}, + {REMOTE_COMPACT_RESUMED_BYTES, "rocksdb.remote.compact.resumed.bytes"}, {HOT_FILE_READ_BYTES, "rocksdb.hot.file.read.bytes"}, {WARM_FILE_READ_BYTES, "rocksdb.warm.file.read.bytes"}, + {COOL_FILE_READ_BYTES, "rocksdb.cool.file.read.bytes"}, {COLD_FILE_READ_BYTES, "rocksdb.cold.file.read.bytes"}, + {ICE_FILE_READ_BYTES, "rocksdb.ice.file.read.bytes"}, {HOT_FILE_READ_COUNT, "rocksdb.hot.file.read.count"}, {WARM_FILE_READ_COUNT, "rocksdb.warm.file.read.count"}, + {COOL_FILE_READ_COUNT, "rocksdb.cool.file.read.count"}, {COLD_FILE_READ_COUNT, "rocksdb.cold.file.read.count"}, + {ICE_FILE_READ_COUNT, "rocksdb.ice.file.read.count"}, {LAST_LEVEL_READ_BYTES, "rocksdb.last.level.read.bytes"}, {LAST_LEVEL_READ_COUNT, "rocksdb.last.level.read.count"}, {NON_LAST_LEVEL_READ_BYTES, "rocksdb.non.last.level.read.bytes"}, @@ -262,6 +268,8 @@ const std::vector> TickersNameMap = { {READAHEAD_TRIMMED, "rocksdb.readahead.trimmed"}, {FIFO_MAX_SIZE_COMPACTIONS, "rocksdb.fifo.max.size.compactions"}, {FIFO_TTL_COMPACTIONS, "rocksdb.fifo.ttl.compactions"}, + {FIFO_CHANGE_TEMPERATURE_COMPACTIONS, + "rocksdb.fifo.change_temperature.compactions"}, {PREFETCH_BYTES, "rocksdb.prefetch.bytes"}, {PREFETCH_BYTES_USEFUL, "rocksdb.prefetch.bytes.useful"}, {PREFETCH_HITS, "rocksdb.prefetch.hits"}, @@ -270,6 +278,24 @@ const std::vector> TickersNameMap = { "rocksdb.file.read.corruption.retry.count"}, {FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT, "rocksdb.file.read.corruption.retry.success.count"}, + {NUMBER_WBWI_INGEST, "rocksdb.number.wbwi.ingest"}, + {SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT, + "rocksdb.sst.user.defined.index.load.fail.count"}, + {MULTISCAN_PREPARE_CALLS, "rocksdb.multiscan.prepare.calls"}, + {MULTISCAN_PREPARE_ERRORS, "rocksdb.multiscan.prepare.errors"}, + {MULTISCAN_BLOCKS_PREFETCHED, "rocksdb.multiscan.blocks.prefetched"}, + {MULTISCAN_BLOCKS_FROM_CACHE, "rocksdb.multiscan.blocks.from.cache"}, + {MULTISCAN_PREFETCH_BYTES, "rocksdb.multiscan.prefetch.bytes"}, + {MULTISCAN_PREFETCH_BLOCKS_WASTED, + "rocksdb.multiscan.prefetch.blocks.wasted"}, + {MULTISCAN_IO_REQUESTS, "rocksdb.multiscan.io.requests"}, + {MULTISCAN_IO_COALESCED_NONADJACENT, + "rocksdb.multiscan.io.coalesced.nonadjacent"}, + {MULTISCAN_SEEK_ERRORS, "rocksdb.multiscan.seek.errors"}, + {PREFETCH_MEMORY_BYTES_GRANTED, "rocksdb.prefetch.memory.bytes.granted"}, + {PREFETCH_MEMORY_BYTES_RELEASED, "rocksdb.prefetch.memory.bytes.released"}, + {PREFETCH_MEMORY_REQUESTS_BLOCKED, + "rocksdb.prefetch.memory.requests.blocked"}, }; const std::vector> HistogramsNameMap = { @@ -336,10 +362,16 @@ const std::vector> HistogramsNameMap = { "rocksdb.error.handler.autoresume.retry.count"}, {ASYNC_READ_BYTES, "rocksdb.async.read.bytes"}, {POLL_WAIT_MICROS, "rocksdb.poll.wait.micros"}, + {COMPACTION_PREFETCH_BYTES, "rocksdb.compaction.prefetch.bytes"}, {PREFETCHED_BYTES_DISCARDED, "rocksdb.prefetched.bytes.discarded"}, {ASYNC_PREFETCH_ABORT_MICROS, "rocksdb.async.prefetch.abort.micros"}, {TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, "rocksdb.table.open.prefetch.tail.read.bytes"}, + {NUM_OP_PER_TRANSACTION, "rocksdb.num.op.per.transaction"}, + {MULTISCAN_PREPARE_ITERATORS, + "rocksdb.multiscan.op.prepare.iterators.micros"}, + {MULTISCAN_PREPARE_MICROS, "rocksdb.multiscan.prepare.micros"}, + {MULTISCAN_BLOCKS_PER_PREPARE, "rocksdb.multiscan.blocks.per.prepare"}, }; std::shared_ptr CreateDBStatistics() { diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc index 295e7bf3daa3..f98917a5f4a3 100644 --- a/monitoring/stats_history_test.cc +++ b/monitoring/stats_history_test.cc @@ -185,7 +185,7 @@ TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) { TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { constexpr int kPeriodSec = 1; - constexpr int kEstimatedOneSliceSize = 16000; + constexpr int kEstimatedOneSliceSize = 22100; Options options; options.create_if_missing = true; @@ -277,7 +277,7 @@ TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { // If `slice_count == 0` when new statistics are added, consider increasing // `kEstimatedOneSliceSize` ASSERT_EQ(slice_count, 1); - ASSERT_TRUE(stats_history_size_reopen < 16000 && + ASSERT_TRUE(stats_history_size_reopen < kEstimatedOneSliceSize && stats_history_size_reopen > 0); ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0); Close(); @@ -616,7 +616,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { // LogNumbers: default: 16, stats: 10, pikachu: 5 // Since in recovery process, cfd_stats column is created after WAL is // created, synced and MANIFEST is persisted, its log number which depends on - // logfile_number_ will be different. Since "pikachu" is never flushed, thus + // cur_wal_number_ will be different. Since "pikachu" is never flushed, thus // its log_number should be the smallest of the three. ASSERT_OK(Flush()); ASSERT_LT(cfd_test->GetLogNumber(), cfd_stats->GetLogNumber()); diff --git a/monitoring/thread_status_impl.cc b/monitoring/thread_status_impl.cc index 153753682cfa..2b3041c4c61d 100644 --- a/monitoring/thread_status_impl.cc +++ b/monitoring/thread_status_impl.cc @@ -13,7 +13,9 @@ namespace ROCKSDB_NAMESPACE { -#ifdef ROCKSDB_USING_THREAD_STATUS +#ifndef NROCKSDB_THREAD_STATUS +const bool ThreadStatus::kEnabled = true; + std::string ThreadStatus::GetThreadTypeName( ThreadStatus::ThreadType thread_type) { switch (thread_type) { @@ -117,6 +119,7 @@ std::map ThreadStatus::InterpretOperationProperties( } #else +const bool ThreadStatus::kEnabled = false; std::string ThreadStatus::GetThreadTypeName( ThreadStatus::ThreadType /*thread_type*/) { @@ -159,5 +162,5 @@ std::map ThreadStatus::InterpretOperationProperties( return std::map(); } -#endif // ROCKSDB_USING_THREAD_STATUS +#endif // !NROCKSDB_THREAD_STATUS } // namespace ROCKSDB_NAMESPACE diff --git a/monitoring/thread_status_updater.cc b/monitoring/thread_status_updater.cc index 37fcef62b0f9..7df2b2c6fa4b 100644 --- a/monitoring/thread_status_updater.cc +++ b/monitoring/thread_status_updater.cc @@ -14,7 +14,7 @@ namespace ROCKSDB_NAMESPACE { -#ifdef ROCKSDB_USING_THREAD_STATUS +#ifndef NROCKSDB_THREAD_STATUS thread_local ThreadStatusData* ThreadStatusUpdater::thread_status_data_ = nullptr; @@ -324,5 +324,5 @@ void ThreadStatusUpdater::SetThreadOperationProperty(int /*i*/, void ThreadStatusUpdater::IncreaseThreadOperationProperty(int /*i*/, uint64_t /*delta*/) {} -#endif // ROCKSDB_USING_THREAD_STATUS +#endif // !NROCKSDB_THREAD_STATUS } // namespace ROCKSDB_NAMESPACE diff --git a/monitoring/thread_status_updater.h b/monitoring/thread_status_updater.h index 696063cb46cd..6d3bc74c4510 100644 --- a/monitoring/thread_status_updater.h +++ b/monitoring/thread_status_updater.h @@ -47,7 +47,7 @@ class ColumnFamilyHandle; // The structure that keeps constant information about a column family. struct ConstantColumnFamilyInfo { -#ifdef ROCKSDB_USING_THREAD_STATUS +#ifndef NROCKSDB_THREAD_STATUS public: ConstantColumnFamilyInfo(const void* _db_key, const std::string& _db_name, const std::string& _cf_name) @@ -55,13 +55,13 @@ struct ConstantColumnFamilyInfo { const void* db_key; const std::string db_name; const std::string cf_name; -#endif // ROCKSDB_USING_THREAD_STATUS +#endif // !NROCKSDB_THREAD_STATUS }; // the internal data-structure that is used to reflect the current // status of a thread using a set of atomic pointers. struct ThreadStatusData { -#ifdef ROCKSDB_USING_THREAD_STATUS +#ifndef NROCKSDB_THREAD_STATUS explicit ThreadStatusData() { enable_tracking.store(false); thread_id.store(0); @@ -86,7 +86,7 @@ struct ThreadStatusData { std::atomic operation_stage; std::atomic op_properties[ThreadStatus::kNumOperationProperties]; std::atomic state_type; -#endif // ROCKSDB_USING_THREAD_STATUS +#endif // !NROCKSDB_THREAD_STATUS }; // The class that stores and updates the status of the current thread @@ -190,7 +190,7 @@ class ThreadStatusUpdater { const std::vector& handles, bool check_exist); protected: -#ifdef ROCKSDB_USING_THREAD_STATUS +#ifndef NROCKSDB_THREAD_STATUS // The thread-local variable for storing thread status. static thread_local ThreadStatusData* thread_status_data_; @@ -220,7 +220,7 @@ class ThreadStatusUpdater { #else static ThreadStatusData* thread_status_data_; -#endif // ROCKSDB_USING_THREAD_STATUS +#endif // !NROCKSDB_THREAD_STATUS }; } // namespace ROCKSDB_NAMESPACE diff --git a/monitoring/thread_status_updater_debug.cc b/monitoring/thread_status_updater_debug.cc index 464c23bbaa89..39b3ef2d0167 100644 --- a/monitoring/thread_status_updater_debug.cc +++ b/monitoring/thread_status_updater_debug.cc @@ -12,7 +12,7 @@ namespace ROCKSDB_NAMESPACE { #ifndef NDEBUG -#ifdef ROCKSDB_USING_THREAD_STATUS +#ifndef NROCKSDB_THREAD_STATUS void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap( const std::vector& handles, bool check_exist) { std::unique_lock lock(thread_list_mutex_); @@ -37,7 +37,7 @@ void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap( const std::vector& /*handles*/, bool /*check_exist*/) { } -#endif // ROCKSDB_USING_THREAD_STATUS +#endif // !NROCKSDB_THREAD_STATUS #endif // !NDEBUG } // namespace ROCKSDB_NAMESPACE diff --git a/monitoring/thread_status_util.cc b/monitoring/thread_status_util.cc index d61bcba1ce55..d84f46a681bd 100644 --- a/monitoring/thread_status_util.cc +++ b/monitoring/thread_status_util.cc @@ -11,7 +11,7 @@ namespace ROCKSDB_NAMESPACE { -#ifdef ROCKSDB_USING_THREAD_STATUS +#ifndef NROCKSDB_THREAD_STATUS thread_local ThreadStatusUpdater* ThreadStatusUtil::thread_updater_local_cache_ = nullptr; thread_local bool ThreadStatusUtil::thread_updater_initialized_ = false; @@ -171,9 +171,10 @@ AutoThreadOperationStageUpdater::~AutoThreadOperationStageUpdater() { ThreadStatusUpdater* ThreadStatusUtil::thread_updater_local_cache_ = nullptr; bool ThreadStatusUtil::thread_updater_initialized_ = false; -bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* /*env*/) { - return false; -} +void ThreadStatusUtil::RegisterThread( + const Env* /*env*/, ThreadStatus::ThreadType /*thread_type*/) {} + +void ThreadStatusUtil::UnregisterThread() {} void ThreadStatusUtil::SetEnableTracking(bool /*enable_tracking*/) {} @@ -204,11 +205,15 @@ void ThreadStatusUtil::EraseDatabaseInfo(const DB* /*db*/) {} void ThreadStatusUtil::ResetThreadStatus() {} +bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* /*env*/) { + return false; +} + AutoThreadOperationStageUpdater::AutoThreadOperationStageUpdater( ThreadStatus::OperationStage /*stage*/) {} AutoThreadOperationStageUpdater::~AutoThreadOperationStageUpdater() {} -#endif // ROCKSDB_USING_THREAD_STATUS +#endif // !NROCKSDB_THREAD_STATUS } // namespace ROCKSDB_NAMESPACE diff --git a/monitoring/thread_status_util.h b/monitoring/thread_status_util.h index df148a039565..082dbd7324b3 100644 --- a/monitoring/thread_status_util.h +++ b/monitoring/thread_status_util.h @@ -90,7 +90,7 @@ class ThreadStatusUtil { // a non-null pointer. static bool MaybeInitThreadLocalUpdater(const Env* env); -#ifdef ROCKSDB_USING_THREAD_STATUS +#ifndef NROCKSDB_THREAD_STATUS // A boolean flag indicating whether thread_updater_local_cache_ // is initialized. It is set to true when an Env uses any // ThreadStatusUtil functions using the current thread other @@ -130,7 +130,7 @@ class AutoThreadOperationStageUpdater { explicit AutoThreadOperationStageUpdater(ThreadStatus::OperationStage stage); ~AutoThreadOperationStageUpdater(); -#ifdef ROCKSDB_USING_THREAD_STATUS +#ifndef NROCKSDB_THREAD_STATUS private: ThreadStatus::OperationStage prev_stage_; #endif diff --git a/monitoring/thread_status_util_debug.cc b/monitoring/thread_status_util_debug.cc index a8233f78c623..7b6211bb5448 100644 --- a/monitoring/thread_status_util_debug.cc +++ b/monitoring/thread_status_util_debug.cc @@ -50,8 +50,9 @@ Env::IOActivity ThreadStatusUtil::TEST_GetExpectedIOActivity( return Env::IOActivity::kGetEntity; case ThreadStatus::OperationType::OP_MULTIGETENTITY: return Env::IOActivity::kMultiGetEntity; - case ThreadStatus::OperationType::OP_READ_MANIFEST: - return Env::IOActivity::kReadManifest; + case ThreadStatus::OperationType:: + OP_GET_FILE_CHECKSUMS_FROM_CURRENT_MANIFEST: + return Env::IOActivity::kGetFileChecksumsFromCurrentManifest; default: return Env::IOActivity::kUnknown; } diff --git a/options/cf_options.cc b/options/cf_options.cc index d50eade93209..2ba56e0f36d8 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -30,6 +30,7 @@ #include "rocksdb/utilities/object_registry.h" #include "rocksdb/utilities/options_type.h" #include "util/cast_util.h" +#include "util/string_util.h" // NOTE: in this file, many option flags that were deprecated // and removed from the rest of the code have to be kept here @@ -301,7 +302,24 @@ static std::unordered_map OptionTypeInfo::Struct("file_temperature_age_thresholds", &file_temperature_age_type_info, 0, OptionVerificationType::kNormal, - OptionTypeFlags::kMutable))}}; + OptionTypeFlags::kMutable))}, + {"allow_trivial_copy_when_change_temperature", + {offsetof(struct CompactionOptionsFIFO, + allow_trivial_copy_when_change_temperature), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"trivial_copy_buffer_size", + {offsetof(struct CompactionOptionsFIFO, trivial_copy_buffer_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_data_files_size", + {offsetof(struct CompactionOptionsFIFO, max_data_files_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"use_kv_ratio_compaction", + {offsetof(struct CompactionOptionsFIFO, use_kv_ratio_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}}; static std::unordered_map universal_compaction_options_type_info = { @@ -340,6 +358,10 @@ static std::unordered_map OptionTypeFlags::kMutable}}, {"allow_trivial_move", {offsetof(class CompactionOptionsUniversal, allow_trivial_move), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"reduce_file_locking", + {offsetof(class CompactionOptionsUniversal, reduce_file_locking), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}}; @@ -382,6 +404,10 @@ static std::unordered_map {offsetof(struct MutableCFOptions, paranoid_file_checks), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"verify_output_flags", + {offsetof(struct MutableCFOptions, verify_output_flags), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {"verify_checksums_in_compaction", {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, OptionTypeFlags::kMutable}}, @@ -437,6 +463,10 @@ static std::unordered_map {offsetof(struct MutableCFOptions, target_file_size_multiplier), OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"target_file_size_is_upper_bound", + {offsetof(struct MutableCFOptions, target_file_size_is_upper_bound), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {"arena_block_size", {offsetof(struct MutableCFOptions, arena_block_size), OptionType::kSizeT, OptionVerificationType::kNormal, @@ -649,6 +679,11 @@ static std::unordered_map {offsetof(struct MutableCFOptions, paranoid_memory_checks), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"memtable_veirfy_per_key_checksum_on_seek", + {offsetof(struct MutableCFOptions, + memtable_veirfy_per_key_checksum_on_seek), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {kOptNameCompOpts, OptionTypeInfo::Struct( kOptNameCompOpts, &compression_options_type_info, @@ -689,12 +724,24 @@ static std::unordered_map name, value, addr); } })}, + {"compression_manager", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct MutableCFOptions, compression_manager), + OptionVerificationType::kByNameAllowNull, + (OptionTypeFlags::kMutable | OptionTypeFlags::kAllowNull))}, // End special case properties {"memtable_max_range_deletions", {offsetof(struct MutableCFOptions, memtable_max_range_deletions), OptionType::kUInt32T, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, - + {"memtable_op_scan_flush_trigger", + {offsetof(struct MutableCFOptions, memtable_op_scan_flush_trigger), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"memtable_avg_op_scan_flush_trigger", + {offsetof(struct MutableCFOptions, memtable_avg_op_scan_flush_trigger), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, }; static std::unordered_map @@ -736,6 +783,10 @@ static std::unordered_map {offsetof(struct ImmutableCFOptions, force_consistency_checks), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"disallow_memtable_writes", + {offsetof(struct ImmutableCFOptions, disallow_memtable_writes), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"default_temperature", {offsetof(struct ImmutableCFOptions, default_temperature), OptionType::kTemperature, OptionVerificationType::kNormal, @@ -745,9 +796,7 @@ static std::unordered_map {0, OptionType::kInt, OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}, {"max_write_buffer_number_to_maintain", - {offsetof(struct ImmutableCFOptions, - max_write_buffer_number_to_maintain), - OptionType::kInt, OptionVerificationType::kNormal, + {0, OptionType::kInt, OptionVerificationType::kDeprecated, OptionTypeFlags::kNone, nullptr}}, {"max_write_buffer_size_to_maintain", {offsetof(struct ImmutableCFOptions, @@ -866,6 +915,10 @@ static std::unordered_map {offsetof(struct ImmutableCFOptions, persist_user_defined_timestamps), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kCompareLoose}}, + {"cf_allow_ingest_behind", + {offsetof(struct ImmutableCFOptions, cf_allow_ingest_behind), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, }; const std::string OptionsHelper::kCFOptionsName = "ColumnFamilyOptions"; @@ -983,8 +1036,6 @@ ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options) compaction_filter_factory(cf_options.compaction_filter_factory), min_write_buffer_number_to_merge( cf_options.min_write_buffer_number_to_merge), - max_write_buffer_number_to_maintain( - cf_options.max_write_buffer_number_to_maintain), max_write_buffer_size_to_maintain( cf_options.max_write_buffer_size_to_maintain), inplace_update_support(cf_options.inplace_update_support), @@ -998,6 +1049,7 @@ ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options) num_levels(cf_options.num_levels), optimize_filters_for_hits(cf_options.optimize_filters_for_hits), force_consistency_checks(cf_options.force_consistency_checks), + disallow_memtable_writes(cf_options.disallow_memtable_writes), default_temperature(cf_options.default_temperature), memtable_insert_with_hint_prefix_extractor( cf_options.memtable_insert_with_hint_prefix_extractor), @@ -1006,7 +1058,8 @@ ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options) sst_partitioner_factory(cf_options.sst_partitioner_factory), blob_cache(cf_options.blob_cache), persist_user_defined_timestamps( - cf_options.persist_user_defined_timestamps) {} + cf_options.persist_user_defined_timestamps), + cf_allow_ingest_behind(cf_options.cf_allow_ingest_behind) {} ImmutableOptions::ImmutableOptions() : ImmutableOptions(Options()) {} @@ -1034,10 +1087,12 @@ uint64_t MultiplyCheckOverflow(uint64_t op1, double op2) { if (op1 == 0 || op2 <= 0) { return 0; } - if (std::numeric_limits::max() / op1 < op2) { - return op1; + + if (op1 * op2 < static_cast(std::numeric_limits::max())) { + return static_cast(op1 * op2); } - return static_cast(op1 * op2); + + return op1; } // when level_compaction_dynamic_level_bytes is true and leveled compaction @@ -1132,6 +1187,8 @@ void MutableCFOptions::Dump(Logger* log) const { target_file_size_base); ROCKS_LOG_INFO(log, " target_file_size_multiplier: %d", target_file_size_multiplier); + ROCKS_LOG_INFO(log, " target_file_size_is_upper_bound: %d", + target_file_size_is_upper_bound); ROCKS_LOG_INFO(log, " max_bytes_for_level_base: %" PRIu64, max_bytes_for_level_base); ROCKS_LOG_INFO(log, " max_bytes_for_level_multiplier: %f", @@ -1147,6 +1204,8 @@ void MutableCFOptions::Dump(Logger* log) const { preserve_internal_time_seconds); ROCKS_LOG_INFO(log, " paranoid_memory_checks: %d", paranoid_memory_checks); + ROCKS_LOG_INFO(log, "memtable_veirfy_per_key_checksum_on_seek: %d", + memtable_veirfy_per_key_checksum_on_seek); std::string result; char buf[10]; for (const auto m : max_bytes_for_level_multiplier_additional) { @@ -1175,7 +1234,10 @@ void MutableCFOptions::Dump(Logger* log) const { bottommost_file_compaction_delay); ROCKS_LOG_INFO(log, " uncache_aggressiveness: %" PRIu32, uncache_aggressiveness); - + ROCKS_LOG_INFO(log, " memtable_op_scan_flush_trigger: %" PRIu32, + memtable_op_scan_flush_trigger); + ROCKS_LOG_INFO(log, " memtable_avg_op_scan_flush_trigger: %" PRIu32, + memtable_avg_op_scan_flush_trigger); // Universal Compaction Options ROCKS_LOG_INFO(log, "compaction_options_universal.size_ratio : %d", compaction_options_universal.size_ratio); @@ -1198,12 +1260,18 @@ void MutableCFOptions::Dump(Logger* log) const { static_cast(compaction_options_universal.allow_trivial_move)); ROCKS_LOG_INFO(log, "compaction_options_universal.incremental : %d", static_cast(compaction_options_universal.incremental)); + ROCKS_LOG_INFO(log, "compaction_options_universal.reduce_file_locking : %d", + compaction_options_universal.reduce_file_locking); // FIFO Compaction Options ROCKS_LOG_INFO(log, "compaction_options_fifo.max_table_files_size : %" PRIu64, compaction_options_fifo.max_table_files_size); ROCKS_LOG_INFO(log, "compaction_options_fifo.allow_compaction : %d", compaction_options_fifo.allow_compaction); + ROCKS_LOG_INFO(log, "compaction_options_fifo.max_data_files_size : %" PRIu64, + compaction_options_fifo.max_data_files_size); + ROCKS_LOG_INFO(log, "compaction_options_fifo.use_kv_ratio_compaction : %d", + compaction_options_fifo.use_kv_ratio_compaction); // Blob file related options ROCKS_LOG_INFO(log, " enable_blob_files: %s", diff --git a/options/cf_options.h b/options/cf_options.h index 751e7b46d52b..3f5804445142 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -40,8 +40,6 @@ struct ImmutableCFOptions { int min_write_buffer_number_to_merge; - int max_write_buffer_number_to_maintain; - int64_t max_write_buffer_size_to_maintain; bool inplace_update_support; @@ -68,6 +66,8 @@ struct ImmutableCFOptions { bool force_consistency_checks; + bool disallow_memtable_writes; + Temperature default_temperature; std::shared_ptr @@ -82,6 +82,8 @@ struct ImmutableCFOptions { std::shared_ptr blob_cache; bool persist_user_defined_timestamps; + + bool cf_allow_ingest_behind; }; struct ImmutableOptions : public ImmutableDBOptions, public ImmutableCFOptions { @@ -130,6 +132,8 @@ struct MutableCFOptions { max_compaction_bytes(options.max_compaction_bytes), target_file_size_base(options.target_file_size_base), target_file_size_multiplier(options.target_file_size_multiplier), + target_file_size_is_upper_bound( + options.target_file_size_is_upper_bound), max_bytes_for_level_base(options.max_bytes_for_level_base), max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), ttl(options.ttl), @@ -141,6 +145,7 @@ struct MutableCFOptions { preclude_last_level_data_seconds( options.preclude_last_level_data_seconds), preserve_internal_time_seconds(options.preserve_internal_time_seconds), + verify_output_flags(options.verify_output_flags), enable_blob_files(options.enable_blob_files), min_blob_size(options.min_blob_size), blob_file_size(options.blob_file_size), @@ -161,19 +166,25 @@ struct MutableCFOptions { bottommost_compression(options.bottommost_compression), compression_opts(options.compression_opts), bottommost_compression_opts(options.bottommost_compression_opts), + compression_manager(options.compression_manager), last_level_temperature(options.last_level_temperature), default_write_temperature(options.default_write_temperature), memtable_protection_bytes_per_key( options.memtable_protection_bytes_per_key), block_protection_bytes_per_key(options.block_protection_bytes_per_key), paranoid_memory_checks(options.paranoid_memory_checks), + memtable_veirfy_per_key_checksum_on_seek( + options.memtable_veirfy_per_key_checksum_on_seek), sample_for_compression( options.sample_for_compression), // TODO: is 0 fine here? compression_per_level(options.compression_per_level), memtable_max_range_deletions(options.memtable_max_range_deletions), bottommost_file_compaction_delay( options.bottommost_file_compaction_delay), - uncache_aggressiveness(options.uncache_aggressiveness) { + uncache_aggressiveness(options.uncache_aggressiveness), + memtable_op_scan_flush_trigger(options.memtable_op_scan_flush_trigger), + memtable_avg_op_scan_flush_trigger( + options.memtable_avg_op_scan_flush_trigger) { RefreshDerivedOptions(options.num_levels, options.compaction_style); } @@ -198,6 +209,7 @@ struct MutableCFOptions { max_compaction_bytes(0), target_file_size_base(0), target_file_size_multiplier(0), + target_file_size_is_upper_bound(false), max_bytes_for_level_base(0), max_bytes_for_level_multiplier(0), ttl(0), @@ -205,6 +217,7 @@ struct MutableCFOptions { compaction_options_fifo(), preclude_last_level_data_seconds(0), preserve_internal_time_seconds(0), + verify_output_flags(VerifyOutputFlags::kVerifyNone), enable_blob_files(false), min_blob_size(0), blob_file_size(0), @@ -225,10 +238,13 @@ struct MutableCFOptions { memtable_protection_bytes_per_key(0), block_protection_bytes_per_key(0), paranoid_memory_checks(false), + memtable_veirfy_per_key_checksum_on_seek(false), sample_for_compression(0), memtable_max_range_deletions(0), bottommost_file_compaction_delay(0), - uncache_aggressiveness(0) {} + uncache_aggressiveness(0), + memtable_op_scan_flush_trigger(0), + memtable_avg_op_scan_flush_trigger(0) {} explicit MutableCFOptions(const Options& options); @@ -249,9 +265,7 @@ struct MutableCFOptions { void Dump(Logger* log) const; -#if __cplusplus >= 202002L bool operator==(const MutableCFOptions& rhs) const = default; -#endif // Memtable related options size_t write_buffer_size; @@ -295,6 +309,7 @@ struct MutableCFOptions { uint64_t max_compaction_bytes; uint64_t target_file_size_base; int target_file_size_multiplier; + bool target_file_size_is_upper_bound; uint64_t max_bytes_for_level_base; double max_bytes_for_level_multiplier; uint64_t ttl; @@ -304,6 +319,7 @@ struct MutableCFOptions { CompactionOptionsUniversal compaction_options_universal; uint64_t preclude_last_level_data_seconds; uint64_t preserve_internal_time_seconds; + VerifyOutputFlags verify_output_flags; // Blob file related options bool enable_blob_files; @@ -325,17 +341,21 @@ struct MutableCFOptions { CompressionType bottommost_compression; CompressionOptions compression_opts; CompressionOptions bottommost_compression_opts; + std::shared_ptr compression_manager; Temperature last_level_temperature; Temperature default_write_temperature; uint32_t memtable_protection_bytes_per_key; uint8_t block_protection_bytes_per_key; bool paranoid_memory_checks; + bool memtable_veirfy_per_key_checksum_on_seek; uint64_t sample_for_compression; std::vector compression_per_level; uint32_t memtable_max_range_deletions; uint32_t bottommost_file_compaction_delay; uint32_t uncache_aggressiveness; + uint32_t memtable_op_scan_flush_trigger; + uint32_t memtable_avg_op_scan_flush_trigger; // Derived options // Per-level target file size. diff --git a/options/configurable.cc b/options/configurable.cc index 76ea54116a23..fe1f7efc9ab7 100644 --- a/options/configurable.cc +++ b/options/configurable.cc @@ -272,7 +272,8 @@ Status ConfigurableHelper::ConfigureOptions( if (config_options.ignore_unknown_options) { s = Status::OK(); } else if (s.ok() && unused == nullptr && !remaining.empty()) { - s = Status::NotFound("Could not find option: ", remaining.begin()->first); + s = Status::NotFound("Extra option not recognized", + remaining.begin()->first); } return s; } @@ -369,7 +370,7 @@ Status ConfigurableHelper::ConfigureSingleOption( const auto opt_info = FindOption(configurable, opt_name, &elem_name, &opt_ptr); if (opt_info == nullptr) { - return Status::NotFound("Could not find option: ", name); + return Status::NotFound("Could not find option", name); } else { return ConfigureOption(config_options, configurable, *opt_info, opt_name, elem_name, value, opt_ptr); @@ -465,7 +466,7 @@ Status ConfigurableHelper::ConfigureOption( return configurable.ParseOption(config_options, opt_info, name, value, opt_ptr); } else { - return Status::NotFound("Could not find option: ", name); + return Status::NotFound("Unknown how to configure option", name); } } diff --git a/options/customizable_test.cc b/options/customizable_test.cc index 8549e7947fa8..53eac3cec182 100644 --- a/options/customizable_test.cc +++ b/options/customizable_test.cc @@ -1281,8 +1281,6 @@ class MockSliceTransform : public SliceTransform { Slice Transform(const Slice& /*key*/) const override { return Slice(); } bool InDomain(const Slice& /*key*/) const override { return false; } - - bool InRange(const Slice& /*key*/) const override { return false; } }; class MockMemoryAllocator : public BaseMemoryAllocator { diff --git a/options/db_options.cc b/options/db_options.cc index ea8f4b22d7be..2384355264c2 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -124,6 +124,18 @@ static std::unordered_map {offsetof(struct MutableDBOptions, max_background_flushes), OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"max_manifest_file_size", + {offsetof(struct MutableDBOptions, max_manifest_file_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_manifest_space_amp_pct", + {offsetof(struct MutableDBOptions, max_manifest_space_amp_pct), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"manifest_preallocation_size", + {offsetof(struct MutableDBOptions, manifest_preallocation_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {"daily_offpeak_time_utc", {offsetof(struct MutableDBOptions, daily_offpeak_time_utc), OptionType::kString, OptionVerificationType::kNormal, @@ -141,6 +153,7 @@ static std::unordered_map std::shared_ptr statistics; std::vector db_paths; FileTypeSet checksum_handoff_file_types; + CompactionStyleSet calculate_sst_write_lifetime_hint_set; */ {"advise_random_on_open", {offsetof(struct ImmutableDBOptions, advise_random_on_open), @@ -246,9 +259,7 @@ static std::unordered_map OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"skip_checking_sst_file_sizes_on_db_open", - {offsetof(struct ImmutableDBOptions, - skip_checking_sst_file_sizes_on_db_open), - OptionType::kBoolean, OptionVerificationType::kNormal, + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}, {"new_table_reader_for_compaction_inputs", {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, @@ -287,10 +298,6 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, log_file_time_to_roll), OptionType::kSizeT, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, - {"manifest_preallocation_size", - {offsetof(struct ImmutableDBOptions, manifest_preallocation_size), - OptionType::kSizeT, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, {"max_log_file_size", {offsetof(struct ImmutableDBOptions, max_log_file_size), OptionType::kSizeT, OptionVerificationType::kNormal, @@ -309,17 +316,12 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, WAL_ttl_seconds), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, - {"max_manifest_file_size", - {offsetof(struct ImmutableDBOptions, max_manifest_file_size), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, {"persist_stats_to_disk", {offsetof(struct ImmutableDBOptions, persist_stats_to_disk), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"fail_if_options_file_error", - {offsetof(struct ImmutableDBOptions, fail_if_options_file_error), - OptionType::kBoolean, OptionVerificationType::kNormal, + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}, {"enable_pipelined_write", {offsetof(struct ImmutableDBOptions, enable_pipelined_write), @@ -657,7 +659,7 @@ class DBOptionsConfigurable : public MutableDBConfigurable { explicit DBOptionsConfigurable( const DBOptions& opts, const std::unordered_map* map = nullptr) - : MutableDBConfigurable(MutableDBOptions(opts), map), db_options_(opts) { + : MutableDBConfigurable(MutableDBOptions{opts}, map), db_options_(opts) { // The ImmutableDBOptions currently requires the env to be non-null. Make // sure it is if (opts.env != nullptr) { @@ -708,7 +710,7 @@ std::unique_ptr DBOptionsAsConfigurable( return ptr; } -ImmutableDBOptions::ImmutableDBOptions() : ImmutableDBOptions(Options()) {} +ImmutableDBOptions::ImmutableDBOptions() : ImmutableDBOptions(DBOptions{}) {} ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) : create_if_missing(options.create_if_missing), @@ -737,13 +739,11 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) log_file_time_to_roll(options.log_file_time_to_roll), keep_log_file_num(options.keep_log_file_num), recycle_log_file_num(options.recycle_log_file_num), - max_manifest_file_size(options.max_manifest_file_size), table_cache_numshardbits(options.table_cache_numshardbits), WAL_ttl_seconds(options.WAL_ttl_seconds), WAL_size_limit_MB(options.WAL_size_limit_MB), max_write_batch_group_size_bytes( options.max_write_batch_group_size_bytes), - manifest_preallocation_size(options.manifest_preallocation_size), allow_mmap_reads(options.allow_mmap_reads), allow_mmap_writes(options.allow_mmap_writes), use_direct_reads(options.use_direct_reads), @@ -765,13 +765,10 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) write_thread_max_yield_usec(options.write_thread_max_yield_usec), write_thread_slow_yield_usec(options.write_thread_slow_yield_usec), skip_stats_update_on_db_open(options.skip_stats_update_on_db_open), - skip_checking_sst_file_sizes_on_db_open( - options.skip_checking_sst_file_sizes_on_db_open), wal_recovery_mode(options.wal_recovery_mode), allow_2pc(options.allow_2pc), row_cache(options.row_cache), wal_filter(options.wal_filter), - fail_if_options_file_error(options.fail_if_options_file_error), dump_malloc_stats(options.dump_malloc_stats), avoid_flush_during_recovery(options.avoid_flush_during_recovery), allow_ingest_behind(options.allow_ingest_behind), @@ -801,7 +798,9 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) follower_catchup_retry_count(options.follower_catchup_retry_count), follower_catchup_retry_wait_ms(options.follower_catchup_retry_wait_ms), metadata_write_temperature(options.metadata_write_temperature), - wal_write_temperature(options.wal_write_temperature) { + wal_write_temperature(options.wal_write_temperature), + calculate_sst_write_lifetime_hint_set( + options.calculate_sst_write_lifetime_hint_set) { fs = env->GetFileSystem(); clock = env->GetSystemClock().get(); logger = info_log.get(); @@ -849,9 +848,6 @@ void ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.max_log_file_size: %" ROCKSDB_PRIszt, max_log_file_size); - ROCKS_LOG_HEADER(log, - " Options.max_manifest_file_size: %" PRIu64, - max_manifest_file_size); ROCKS_LOG_HEADER( log, " Options.log_file_time_to_roll: %" ROCKSDB_PRIszt, log_file_time_to_roll); @@ -891,9 +887,6 @@ void ImmutableDBOptions::Dump(Logger* log) const { " " "Options.max_write_batch_group_size_bytes: %" PRIu64, max_write_batch_group_size_bytes); - ROCKS_LOG_HEADER( - log, " Options.manifest_preallocation_size: %" ROCKSDB_PRIszt, - manifest_preallocation_size); ROCKS_LOG_HEADER(log, " Options.is_fd_close_on_exec: %d", is_fd_close_on_exec); ROCKS_LOG_HEADER(log, " Options.advise_random_on_open: %d", @@ -1024,24 +1017,7 @@ const std::string& ImmutableDBOptions::GetWalDir( } } -MutableDBOptions::MutableDBOptions() - : max_background_jobs(2), - max_background_compactions(-1), - max_subcompactions(0), - avoid_flush_during_shutdown(false), - writable_file_max_buffer_size(1024 * 1024), - delayed_write_rate(2 * 1024U * 1024U), - max_total_wal_size(0), - delete_obsolete_files_period_micros(6ULL * 60 * 60 * 1000000), - stats_dump_period_sec(600), - stats_persist_period_sec(600), - stats_history_buffer_size(1024 * 1024), - max_open_files(-1), - bytes_per_sync(0), - wal_bytes_per_sync(0), - strict_bytes_per_sync(false), - compaction_readahead_size(0), - max_background_flushes(-1) {} +MutableDBOptions::MutableDBOptions() : MutableDBOptions(DBOptions{}) {} MutableDBOptions::MutableDBOptions(const DBOptions& options) : max_background_jobs(options.max_background_jobs), @@ -1062,6 +1038,9 @@ MutableDBOptions::MutableDBOptions(const DBOptions& options) strict_bytes_per_sync(options.strict_bytes_per_sync), compaction_readahead_size(options.compaction_readahead_size), max_background_flushes(options.max_background_flushes), + max_manifest_file_size(options.max_manifest_file_size), + max_manifest_space_amp_pct(options.max_manifest_space_amp_pct), + manifest_preallocation_size(options.manifest_preallocation_size), daily_offpeak_time_utc(options.daily_offpeak_time_utc) {} void MutableDBOptions::Dump(Logger* log) const { @@ -1106,6 +1085,15 @@ void MutableDBOptions::Dump(Logger* log) const { compaction_readahead_size); ROCKS_LOG_HEADER(log, " Options.max_background_flushes: %d", max_background_flushes); + ROCKS_LOG_HEADER(log, + " Options.max_manifest_file_size: %" PRIu64, + max_manifest_file_size); + ROCKS_LOG_HEADER(log, + " Options.max_manifest_space_amp_pct: %d", + max_manifest_space_amp_pct); + ROCKS_LOG_HEADER( + log, " Options.manifest_preallocation_size: %" ROCKSDB_PRIszt, + manifest_preallocation_size); ROCKS_LOG_HEADER(log, "Options.daily_offpeak_time_utc: %s", daily_offpeak_time_utc.c_str()); } diff --git a/options/db_options.h b/options/db_options.h index df0854f1dd61..cc978d907dbb 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -47,12 +47,10 @@ struct ImmutableDBOptions { size_t log_file_time_to_roll; size_t keep_log_file_num; size_t recycle_log_file_num; - uint64_t max_manifest_file_size; int table_cache_numshardbits; uint64_t WAL_ttl_seconds; uint64_t WAL_size_limit_MB; uint64_t max_write_batch_group_size_bytes; - size_t manifest_preallocation_size; bool allow_mmap_reads; bool allow_mmap_writes; bool use_direct_reads; @@ -72,12 +70,10 @@ struct ImmutableDBOptions { uint64_t write_thread_max_yield_usec; uint64_t write_thread_slow_yield_usec; bool skip_stats_update_on_db_open; - bool skip_checking_sst_file_sizes_on_db_open; WALRecoveryMode wal_recovery_mode; bool allow_2pc; std::shared_ptr row_cache; WalFilter* wal_filter; - bool fail_if_options_file_error; bool dump_malloc_stats; bool avoid_flush_during_recovery; bool allow_ingest_behind; @@ -107,6 +103,7 @@ struct ImmutableDBOptions { uint64_t follower_catchup_retry_wait_ms; Temperature metadata_write_temperature; Temperature wal_write_temperature; + CompactionStyleSet calculate_sst_write_lifetime_hint_set; // Beginning convenience/helper objects that are not part of the base // DBOptions @@ -146,6 +143,9 @@ struct MutableDBOptions { bool strict_bytes_per_sync; size_t compaction_readahead_size; int max_background_flushes; + uint64_t max_manifest_file_size; + int max_manifest_space_amp_pct; + size_t manifest_preallocation_size; std::string daily_offpeak_time_utc; }; diff --git a/options/options.cc b/options/options.cc index c1e68260a14d..d9f64f93d235 100644 --- a/options/options.cc +++ b/options/options.cc @@ -43,8 +43,6 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) : max_write_buffer_number(options.max_write_buffer_number), min_write_buffer_number_to_merge( options.min_write_buffer_number_to_merge), - max_write_buffer_number_to_maintain( - options.max_write_buffer_number_to_maintain), max_write_buffer_size_to_maintain( options.max_write_buffer_size_to_maintain), inplace_update_support(options.inplace_update_support), @@ -65,6 +63,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) level0_stop_writes_trigger(options.level0_stop_writes_trigger), target_file_size_base(options.target_file_size_base), target_file_size_multiplier(options.target_file_size_multiplier), + target_file_size_is_upper_bound(options.target_file_size_is_upper_bound), level_compaction_dynamic_level_bytes( options.level_compaction_dynamic_level_bytes), max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), @@ -90,6 +89,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) paranoid_file_checks(options.paranoid_file_checks), force_consistency_checks(options.force_consistency_checks), report_bg_io_stats(options.report_bg_io_stats), + disallow_memtable_writes(options.disallow_memtable_writes), ttl(options.ttl), periodic_compaction_seconds(options.periodic_compaction_seconds), sample_for_compression(options.sample_for_compression), @@ -112,7 +112,10 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) blob_file_starting_level(options.blob_file_starting_level), blob_cache(options.blob_cache), prepopulate_blob_cache(options.prepopulate_blob_cache), - persist_user_defined_timestamps(options.persist_user_defined_timestamps) { + persist_user_defined_timestamps(options.persist_user_defined_timestamps), + memtable_op_scan_flush_trigger(options.memtable_op_scan_flush_trigger), + memtable_avg_op_scan_flush_trigger( + options.memtable_avg_op_scan_flush_trigger) { assert(memtable_factory.get() != nullptr); if (max_bytes_for_level_multiplier_additional.size() < static_cast(num_levels)) { @@ -191,8 +194,6 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.num_levels: %d", num_levels); ROCKS_LOG_HEADER(log, " Options.min_write_buffer_number_to_merge: %d", min_write_buffer_number_to_merge); - ROCKS_LOG_HEADER(log, " Options.max_write_buffer_number_to_maintain: %d", - max_write_buffer_number_to_maintain); ROCKS_LOG_HEADER(log, " Options.max_write_buffer_size_to_maintain: %" PRIu64, max_write_buffer_size_to_maintain); @@ -269,6 +270,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const { target_file_size_base); ROCKS_LOG_HEADER(log, " Options.target_file_size_multiplier: %d", target_file_size_multiplier); + ROCKS_LOG_HEADER(log, + " Options.target_file_size_is_upper_bound: %d", + target_file_size_is_upper_bound); ROCKS_LOG_HEADER(log, " Options.max_bytes_for_level_base: %" PRIu64, max_bytes_for_level_base); @@ -286,6 +290,12 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.max_sequential_skip_in_iterations: %" PRIu64, max_sequential_skip_in_iterations); + ROCKS_LOG_HEADER(log, + " Options.memtable_op_scan_flush_trigger: %" PRIu32, + memtable_op_scan_flush_trigger); + ROCKS_LOG_HEADER(log, + " Options.memtable_avg_op_scan_flush_trigger: %" PRIu32, + memtable_avg_op_scan_flush_trigger); ROCKS_LOG_HEADER(log, " Options.max_compaction_bytes: %" PRIu64, max_compaction_bytes); @@ -352,6 +362,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const { str_compaction_stop_style.c_str()); ROCKS_LOG_HEADER(log, "Options.compaction_options_universal.max_read_amp: %d", compaction_options_universal.max_read_amp); + ROCKS_LOG_HEADER( + log, "Options.compaction_options_universal.reduce_file_locking: %d", + compaction_options_universal.reduce_file_locking); ROCKS_LOG_HEADER( log, "Options.compaction_options_fifo.max_table_files_size: %" PRIu64, compaction_options_fifo.max_table_files_size); @@ -395,6 +408,8 @@ void ColumnFamilyOptions::Dump(Logger* log) const { force_consistency_checks); ROCKS_LOG_HEADER(log, " Options.report_bg_io_stats: %d", report_bg_io_stats); + ROCKS_LOG_HEADER(log, " Options.disallow_memtable_writes: %d", + disallow_memtable_writes); ROCKS_LOG_HEADER(log, " Options.ttl: %" PRIu64, ttl); ROCKS_LOG_HEADER(log, @@ -451,6 +466,8 @@ void ColumnFamilyOptions::Dump(Logger* log) const { experimental_mempurge_threshold); ROCKS_LOG_HEADER(log, " Options.memtable_max_range_deletions: %d", memtable_max_range_deletions); + ROCKS_LOG_HEADER(log, " Options.cf_allow_ingest_behind: %s", + cf_allow_ingest_behind ? "true" : "false"); } // ColumnFamilyOptions::Dump void Options::Dump(Logger* log) const { diff --git a/options/options_helper.cc b/options/options_helper.cc index fad122166a0a..addada94f927 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -99,13 +99,15 @@ void BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.log_file_time_to_roll = immutable_db_options.log_file_time_to_roll; options.keep_log_file_num = immutable_db_options.keep_log_file_num; options.recycle_log_file_num = immutable_db_options.recycle_log_file_num; - options.max_manifest_file_size = immutable_db_options.max_manifest_file_size; + options.max_manifest_file_size = mutable_db_options.max_manifest_file_size; + options.max_manifest_space_amp_pct = + mutable_db_options.max_manifest_space_amp_pct; options.table_cache_numshardbits = immutable_db_options.table_cache_numshardbits; options.WAL_ttl_seconds = immutable_db_options.WAL_ttl_seconds; options.WAL_size_limit_MB = immutable_db_options.WAL_size_limit_MB; options.manifest_preallocation_size = - immutable_db_options.manifest_preallocation_size; + mutable_db_options.manifest_preallocation_size; options.allow_mmap_reads = immutable_db_options.allow_mmap_reads; options.allow_mmap_writes = immutable_db_options.allow_mmap_writes; options.use_direct_reads = immutable_db_options.use_direct_reads; @@ -147,14 +149,10 @@ void BuildDBOptions(const ImmutableDBOptions& immutable_db_options, immutable_db_options.write_thread_slow_yield_usec; options.skip_stats_update_on_db_open = immutable_db_options.skip_stats_update_on_db_open; - options.skip_checking_sst_file_sizes_on_db_open = - immutable_db_options.skip_checking_sst_file_sizes_on_db_open; options.wal_recovery_mode = immutable_db_options.wal_recovery_mode; options.allow_2pc = immutable_db_options.allow_2pc; options.row_cache = immutable_db_options.row_cache; options.wal_filter = immutable_db_options.wal_filter; - options.fail_if_options_file_error = - immutable_db_options.fail_if_options_file_error; options.dump_malloc_stats = immutable_db_options.dump_malloc_stats; options.avoid_flush_during_recovery = immutable_db_options.avoid_flush_during_recovery; @@ -199,6 +197,8 @@ void BuildDBOptions(const ImmutableDBOptions& immutable_db_options, immutable_db_options.metadata_write_temperature; options.wal_write_temperature = immutable_db_options.wal_write_temperature; options.compaction_service = immutable_db_options.compaction_service; + options.calculate_sst_write_lifetime_hint_set = + immutable_db_options.calculate_sst_write_lifetime_hint_set; } ColumnFamilyOptions BuildColumnFamilyOptions( @@ -232,6 +232,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, cf_opts->block_protection_bytes_per_key = moptions.block_protection_bytes_per_key; cf_opts->paranoid_memory_checks = moptions.paranoid_memory_checks; + cf_opts->memtable_veirfy_per_key_checksum_on_seek = + moptions.memtable_veirfy_per_key_checksum_on_seek; cf_opts->bottommost_file_compaction_delay = moptions.bottommost_file_compaction_delay; @@ -250,6 +252,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, cf_opts->max_compaction_bytes = moptions.max_compaction_bytes; cf_opts->target_file_size_base = moptions.target_file_size_base; cf_opts->target_file_size_multiplier = moptions.target_file_size_multiplier; + cf_opts->target_file_size_is_upper_bound = + moptions.target_file_size_is_upper_bound; cf_opts->max_bytes_for_level_base = moptions.max_bytes_for_level_base; cf_opts->max_bytes_for_level_multiplier = moptions.max_bytes_for_level_multiplier; @@ -268,6 +272,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, cf_opts->compaction_options_fifo = moptions.compaction_options_fifo; cf_opts->compaction_options_universal = moptions.compaction_options_universal; + cf_opts->verify_output_flags = moptions.verify_output_flags; + // Blob file related options cf_opts->enable_blob_files = moptions.enable_blob_files; cf_opts->min_blob_size = moptions.min_blob_size; @@ -293,12 +299,17 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, cf_opts->compression_opts = moptions.compression_opts; cf_opts->bottommost_compression = moptions.bottommost_compression; cf_opts->bottommost_compression_opts = moptions.bottommost_compression_opts; + cf_opts->compression_manager = moptions.compression_manager; cf_opts->sample_for_compression = moptions.sample_for_compression; cf_opts->compression_per_level = moptions.compression_per_level; cf_opts->last_level_temperature = moptions.last_level_temperature; cf_opts->default_write_temperature = moptions.default_write_temperature; cf_opts->memtable_max_range_deletions = moptions.memtable_max_range_deletions; cf_opts->uncache_aggressiveness = moptions.uncache_aggressiveness; + cf_opts->memtable_op_scan_flush_trigger = + moptions.memtable_op_scan_flush_trigger; + cf_opts->memtable_avg_op_scan_flush_trigger = + moptions.memtable_avg_op_scan_flush_trigger; } void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, @@ -311,8 +322,6 @@ void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, cf_opts->compaction_filter_factory = ioptions.compaction_filter_factory; cf_opts->min_write_buffer_number_to_merge = ioptions.min_write_buffer_number_to_merge; - cf_opts->max_write_buffer_number_to_maintain = - ioptions.max_write_buffer_number_to_maintain; cf_opts->max_write_buffer_size_to_maintain = ioptions.max_write_buffer_size_to_maintain; cf_opts->inplace_update_support = ioptions.inplace_update_support; @@ -326,6 +335,7 @@ void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, cf_opts->num_levels = ioptions.num_levels; cf_opts->optimize_filters_for_hits = ioptions.optimize_filters_for_hits; cf_opts->force_consistency_checks = ioptions.force_consistency_checks; + cf_opts->disallow_memtable_writes = ioptions.disallow_memtable_writes; cf_opts->memtable_insert_with_hint_prefix_extractor = ioptions.memtable_insert_with_hint_prefix_extractor; cf_opts->cf_paths = ioptions.cf_paths; @@ -335,6 +345,7 @@ void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, cf_opts->persist_user_defined_timestamps = ioptions.persist_user_defined_timestamps; cf_opts->default_temperature = ioptions.default_temperature; + cf_opts->cf_allow_ingest_behind = ioptions.cf_allow_ingest_behind; // TODO(yhchiang): find some way to handle the following derived options // * max_file_size @@ -360,10 +371,9 @@ std::map {kCompactionStopStyleTotalSize, "kCompactionStopStyleTotalSize"}}; std::map OptionsHelper::temperature_to_string = { - {Temperature::kUnknown, "kUnknown"}, - {Temperature::kHot, "kHot"}, - {Temperature::kWarm, "kWarm"}, - {Temperature::kCold, "kCold"}}; + {Temperature::kUnknown, "kUnknown"}, {Temperature::kHot, "kHot"}, + {Temperature::kWarm, "kWarm"}, {Temperature::kCool, "kCool"}, + {Temperature::kCold, "kCold"}, {Temperature::kIce, "kIce"}}; std::unordered_map OptionsHelper::checksum_type_string_map = {{"kNoChecksum", kNoChecksum}, @@ -382,6 +392,133 @@ std::unordered_map {"kLZ4HCCompression", kLZ4HCCompression}, {"kXpressCompression", kXpressCompression}, {"kZSTD", kZSTD}, + {"kCustomCompression80", kCustomCompression80}, + {"kCustomCompression81", kCustomCompression81}, + {"kCustomCompression82", kCustomCompression82}, + {"kCustomCompression83", kCustomCompression83}, + {"kCustomCompression84", kCustomCompression84}, + {"kCustomCompression85", kCustomCompression85}, + {"kCustomCompression86", kCustomCompression86}, + {"kCustomCompression87", kCustomCompression87}, + {"kCustomCompression88", kCustomCompression88}, + {"kCustomCompression89", kCustomCompression89}, + {"kCustomCompression8A", kCustomCompression8A}, + {"kCustomCompression8B", kCustomCompression8B}, + {"kCustomCompression8C", kCustomCompression8C}, + {"kCustomCompression8D", kCustomCompression8D}, + {"kCustomCompression8E", kCustomCompression8E}, + {"kCustomCompression8F", kCustomCompression8F}, + {"kCustomCompression90", kCustomCompression90}, + {"kCustomCompression91", kCustomCompression91}, + {"kCustomCompression92", kCustomCompression92}, + {"kCustomCompression93", kCustomCompression93}, + {"kCustomCompression94", kCustomCompression94}, + {"kCustomCompression95", kCustomCompression95}, + {"kCustomCompression96", kCustomCompression96}, + {"kCustomCompression97", kCustomCompression97}, + {"kCustomCompression98", kCustomCompression98}, + {"kCustomCompression99", kCustomCompression99}, + {"kCustomCompression9A", kCustomCompression9A}, + {"kCustomCompression9B", kCustomCompression9B}, + {"kCustomCompression9C", kCustomCompression9C}, + {"kCustomCompression9D", kCustomCompression9D}, + {"kCustomCompression9E", kCustomCompression9E}, + {"kCustomCompression9F", kCustomCompression9F}, + {"kCustomCompressionA0", kCustomCompressionA0}, + {"kCustomCompressionA1", kCustomCompressionA1}, + {"kCustomCompressionA2", kCustomCompressionA2}, + {"kCustomCompressionA3", kCustomCompressionA3}, + {"kCustomCompressionA4", kCustomCompressionA4}, + {"kCustomCompressionA5", kCustomCompressionA5}, + {"kCustomCompressionA6", kCustomCompressionA6}, + {"kCustomCompressionA7", kCustomCompressionA7}, + {"kCustomCompressionA8", kCustomCompressionA8}, + {"kCustomCompressionA9", kCustomCompressionA9}, + {"kCustomCompressionAA", kCustomCompressionAA}, + {"kCustomCompressionAB", kCustomCompressionAB}, + {"kCustomCompressionAC", kCustomCompressionAC}, + {"kCustomCompressionAD", kCustomCompressionAD}, + {"kCustomCompressionAE", kCustomCompressionAE}, + {"kCustomCompressionAF", kCustomCompressionAF}, + {"kCustomCompressionB0", kCustomCompressionB0}, + {"kCustomCompressionB1", kCustomCompressionB1}, + {"kCustomCompressionB2", kCustomCompressionB2}, + {"kCustomCompressionB3", kCustomCompressionB3}, + {"kCustomCompressionB4", kCustomCompressionB4}, + {"kCustomCompressionB5", kCustomCompressionB5}, + {"kCustomCompressionB6", kCustomCompressionB6}, + {"kCustomCompressionB7", kCustomCompressionB7}, + {"kCustomCompressionB8", kCustomCompressionB8}, + {"kCustomCompressionB9", kCustomCompressionB9}, + {"kCustomCompressionBA", kCustomCompressionBA}, + {"kCustomCompressionBB", kCustomCompressionBB}, + {"kCustomCompressionBC", kCustomCompressionBC}, + {"kCustomCompressionBD", kCustomCompressionBD}, + {"kCustomCompressionBE", kCustomCompressionBE}, + {"kCustomCompressionBF", kCustomCompressionBF}, + {"kCustomCompressionC0", kCustomCompressionC0}, + {"kCustomCompressionC1", kCustomCompressionC1}, + {"kCustomCompressionC2", kCustomCompressionC2}, + {"kCustomCompressionC3", kCustomCompressionC3}, + {"kCustomCompressionC4", kCustomCompressionC4}, + {"kCustomCompressionC5", kCustomCompressionC5}, + {"kCustomCompressionC6", kCustomCompressionC6}, + {"kCustomCompressionC7", kCustomCompressionC7}, + {"kCustomCompressionC8", kCustomCompressionC8}, + {"kCustomCompressionC9", kCustomCompressionC9}, + {"kCustomCompressionCA", kCustomCompressionCA}, + {"kCustomCompressionCB", kCustomCompressionCB}, + {"kCustomCompressionCC", kCustomCompressionCC}, + {"kCustomCompressionCD", kCustomCompressionCD}, + {"kCustomCompressionCE", kCustomCompressionCE}, + {"kCustomCompressionCF", kCustomCompressionCF}, + {"kCustomCompressionD0", kCustomCompressionD0}, + {"kCustomCompressionD1", kCustomCompressionD1}, + {"kCustomCompressionD2", kCustomCompressionD2}, + {"kCustomCompressionD3", kCustomCompressionD3}, + {"kCustomCompressionD4", kCustomCompressionD4}, + {"kCustomCompressionD5", kCustomCompressionD5}, + {"kCustomCompressionD6", kCustomCompressionD6}, + {"kCustomCompressionD7", kCustomCompressionD7}, + {"kCustomCompressionD8", kCustomCompressionD8}, + {"kCustomCompressionD9", kCustomCompressionD9}, + {"kCustomCompressionDA", kCustomCompressionDA}, + {"kCustomCompressionDB", kCustomCompressionDB}, + {"kCustomCompressionDC", kCustomCompressionDC}, + {"kCustomCompressionDD", kCustomCompressionDD}, + {"kCustomCompressionDE", kCustomCompressionDE}, + {"kCustomCompressionDF", kCustomCompressionDF}, + {"kCustomCompressionE0", kCustomCompressionE0}, + {"kCustomCompressionE1", kCustomCompressionE1}, + {"kCustomCompressionE2", kCustomCompressionE2}, + {"kCustomCompressionE3", kCustomCompressionE3}, + {"kCustomCompressionE4", kCustomCompressionE4}, + {"kCustomCompressionE5", kCustomCompressionE5}, + {"kCustomCompressionE6", kCustomCompressionE6}, + {"kCustomCompressionE7", kCustomCompressionE7}, + {"kCustomCompressionE8", kCustomCompressionE8}, + {"kCustomCompressionE9", kCustomCompressionE9}, + {"kCustomCompressionEA", kCustomCompressionEA}, + {"kCustomCompressionEB", kCustomCompressionEB}, + {"kCustomCompressionEC", kCustomCompressionEC}, + {"kCustomCompressionED", kCustomCompressionED}, + {"kCustomCompressionEE", kCustomCompressionEE}, + {"kCustomCompressionEF", kCustomCompressionEF}, + {"kCustomCompressionF0", kCustomCompressionF0}, + {"kCustomCompressionF1", kCustomCompressionF1}, + {"kCustomCompressionF2", kCustomCompressionF2}, + {"kCustomCompressionF3", kCustomCompressionF3}, + {"kCustomCompressionF4", kCustomCompressionF4}, + {"kCustomCompressionF5", kCustomCompressionF5}, + {"kCustomCompressionF6", kCustomCompressionF6}, + {"kCustomCompressionF7", kCustomCompressionF7}, + {"kCustomCompressionF8", kCustomCompressionF8}, + {"kCustomCompressionF9", kCustomCompressionF9}, + {"kCustomCompressionFA", kCustomCompressionFA}, + {"kCustomCompressionFB", kCustomCompressionFB}, + {"kCustomCompressionFC", kCustomCompressionFC}, + {"kCustomCompressionFD", kCustomCompressionFD}, + {"kCustomCompressionFE", kCustomCompressionFE}, {"kDisableCompressionOption", kDisableCompressionOption}}; const std::vector& GetSupportedCompressions() { @@ -564,7 +701,6 @@ bool SerializeSingleOptionHelper(const void* opt_address, return SerializeEnum( compression_type_string_map, *(static_cast(opt_address)), value); - break; case OptionType::kChecksumType: return SerializeEnum( checksum_type_string_map, @@ -832,10 +968,9 @@ std::unordered_map std::unordered_map OptionsHelper::temperature_string_map = { - {"kUnknown", Temperature::kUnknown}, - {"kHot", Temperature::kHot}, - {"kWarm", Temperature::kWarm}, - {"kCold", Temperature::kCold}}; + {"kUnknown", Temperature::kUnknown}, {"kHot", Temperature::kHot}, + {"kWarm", Temperature::kWarm}, {"kCool", Temperature::kCool}, + {"kCold", Temperature::kCold}, {"kIce", Temperature::kIce}}; std::unordered_map OptionsHelper::prepopulate_blob_cache_string_map = { @@ -907,7 +1042,7 @@ Status OptionTypeInfo::Parse(const ConfigOptions& config_options, : value; if (opt_ptr == nullptr) { - return Status::NotFound("Could not find option", opt_name); + return Status::NotFound("Nullptr option", opt_name); } else if (parse_func_ != nullptr) { ConfigOptions copy = config_options; copy.invoke_prepare_options = false; diff --git a/options/options_helper.h b/options/options_helper.h index f03179066eaf..74e953b9f507 100644 --- a/options/options_helper.h +++ b/options/options_helper.h @@ -72,6 +72,9 @@ std::unique_ptr CFOptionsAsConfigurable( Status StringToMap(const std::string& opts_str, std::unordered_map* opts_map); +Status GetStringFromCompressionType(std::string* compression_str, + CompressionType compression_type); + struct OptionsHelper { static const std::string kCFOptionsName /*= "ColumnFamilyOptions"*/; static const std::string kDBOptionsName /*= "DBOptions" */; diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index d6660908d8b8..3c12a9e859a9 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -129,6 +129,8 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { sizeof(CacheUsageOptions)}, {offsetof(struct BlockBasedTableOptions, filter_policy), sizeof(std::shared_ptr)}, + {offsetof(struct BlockBasedTableOptions, user_defined_index_factory), + sizeof(std::shared_ptr)}, }; // In this test, we catch a new option of BlockBasedTableOptions that is not @@ -180,6 +182,7 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { "pin_l0_filter_and_index_blocks_in_cache=1;" "pin_top_level_index_and_filter=1;" "index_type=kHashSearch;" + "index_block_search_type=kBinary;" "data_block_index_type=kDataBlockBinaryAndHash;" "index_shortening=kNoShortening;" "data_block_hash_table_util_ratio=0.75;" @@ -198,10 +201,13 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { "verify_compression=true;read_amp_bytes_per_bit=0;" "enable_index_compression=false;" "block_align=true;" + "super_block_alignment_size=65536;" + "super_block_alignment_space_overhead_ratio=4096;" "max_auto_readahead_size=0;" "prepopulate_block_cache=kDisable;" "initial_auto_readahead_size=0;" - "num_file_reads_for_auto_readahead=0", + "num_file_reads_for_auto_readahead=0;" + "fail_if_no_udi_on_open=true", new_bbto)); ASSERT_EQ(unset_bytes_base, @@ -272,8 +278,8 @@ TEST_F(OptionsSettableTest, TablePropertiesAllFieldsSettable) { "property_collectors_names=;prefix_extractor_name=;db_host_id=" "64625F686F73745F6964;db_session_id=64625F73657373696F6E5F6964;creation_" "time=0;num_data_blocks=123;index_value_is_delta_encoded=0;top_level_" - "index_" - "size=0;data_size=100;merge_operator_name=;index_partitions=0;file_" + "index_size=0;data_size=100;uncompressed_data_size=1234;" + "merge_operator_name=;index_partitions=0;file_" "creation_time=0;raw_value_size=0;index_size=200;user_collected_" "properties={757365725F6B6579=757365725F76616C7565;};tail_start_offset=0;" "seqno_to_time_mapping=;raw_key_size=0;slow_compression_estimated_data_" @@ -286,7 +292,8 @@ TEST_F(OptionsSettableTest, TablePropertiesAllFieldsSettable) { "0;column_family_" "name=64656661756C74;user_defined_timestamps_persisted=1;num_entries=100;" "external_sst_file_global_seqno_offset=0;num_merge_operands=0;index_key_" - "is_user_key=0;key_largest_seqno=18446744073709551615;", + "is_user_key=0;key_largest_seqno=18446744073709551615;key_smallest_seqno=" + "18;", new_tp)); // All bytes are set from the parse @@ -342,6 +349,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { {offsetof(struct DBOptions, compaction_service), sizeof(std::shared_ptr)}, {offsetof(struct DBOptions, daily_offpeak_time_utc), sizeof(std::string)}, + {offsetof(struct DBOptions, calculate_sst_write_lifetime_hint_set), + sizeof(CompactionStyleSet)}, }; char* options_ptr = new char[sizeof(DBOptions)]; @@ -398,8 +407,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "compaction_readahead_size=0;" "keep_log_file_num=4890;" "skip_stats_update_on_db_open=false;" - "skip_checking_sst_file_sizes_on_db_open=false;" "max_manifest_file_size=4295009941;" + "max_manifest_space_amp_pct=321;" "db_log_dir=path/to/db_log_dir;" "writable_file_max_buffer_size=1048576;" "paranoid_checks=true;" @@ -431,7 +440,6 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "use_direct_io_for_flush_and_compaction=false;" "max_log_file_size=4607;" "advise_random_on_open=true;" - "fail_if_options_file_error=false;" "enable_pipelined_write=false;" "unordered_write=false;" "allow_concurrent_memtable_write=true;" @@ -500,7 +508,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { // ColumnFamilyOptions. const OffsetGap kColumnFamilyOptionsExcluded = { {offsetof(struct ColumnFamilyOptions, inplace_callback), - sizeof(UpdateStatus(*)(char*, uint32_t*, Slice, std::string*))}, + sizeof(UpdateStatus (*)(char*, uint32_t*, Slice, std::string*))}, {offsetof(struct ColumnFamilyOptions, memtable_insert_with_hint_prefix_extractor), sizeof(std::shared_ptr)}, @@ -529,10 +537,10 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { sizeof(const CompactionFilter*)}, {offsetof(struct ColumnFamilyOptions, compaction_filter_factory), sizeof(std::shared_ptr)}, + {offsetof(struct ColumnFamilyOptions, compression_manager), + sizeof(std::shared_ptr)}, {offsetof(struct ColumnFamilyOptions, prefix_extractor), sizeof(std::shared_ptr)}, - {offsetof(struct ColumnFamilyOptions, snap_refresh_nanos), - sizeof(uint64_t)}, {offsetof(struct ColumnFamilyOptions, table_factory), sizeof(std::shared_ptr)}, {offsetof(struct ColumnFamilyOptions, cf_paths), @@ -601,6 +609,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "max_sequential_skip_in_iterations=4294971408;" "arena_block_size=1893;" "target_file_size_multiplier=35;" + "target_file_size_is_upper_bound=false;" "min_write_buffer_number_to_merge=9;" "max_write_buffer_number=84;" "write_buffer_size=1653;" @@ -618,13 +627,13 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "strategy=7;max_dict_bytes=8;level=9;window_bits=10;max_compressed_bytes_" "per_kb=876;checksum=true};" "bottommost_compression=kDisableCompressionOption;" + "compression_manager=BuiltinV2;" "level0_stop_writes_trigger=33;" "num_levels=99;" "level0_slowdown_writes_trigger=22;" "level0_file_num_compaction_trigger=14;" "compaction_filter=urxcqstuwnCompactionFilter;" "soft_pending_compaction_bytes_limit=0;" - "max_write_buffer_number_to_maintain=84;" "max_write_buffer_size_to_maintain=2147483648;" "merge_operator=aabcxehazrMergeOperator;" "memtable_prefix_bloom_size_ratio=0.4642;" @@ -644,6 +653,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "hard_pending_compaction_bytes_limit=0;" "disable_auto_compactions=false;" "report_bg_io_stats=true;" + "disallow_memtable_writes=true;" "ttl=60;" "periodic_compaction_seconds=3600;" "sample_for_compression=0;" @@ -665,7 +675,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "preserve_internal_time_seconds=86400;" "compaction_options_fifo={max_table_files_size=3;allow_" "compaction=true;age_for_warm=0;file_temperature_age_thresholds={{" - "temperature=kCold;age=12345}};};" + "temperature=kCold;age=12345}};max_data_files_size=1073741824;" + "use_kv_ratio_compaction=false;};" "blob_cache=1M;" "memtable_protection_bytes_per_key=2;" "persist_user_defined_timestamps=true;" @@ -673,7 +684,12 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "memtable_max_range_deletions=999999;" "bottommost_file_compaction_delay=7200;" "uncache_aggressiveness=1234;" - "paranoid_memory_checks=1;", + "paranoid_memory_checks=1;" + "memtable_veirfy_per_key_checksum_on_seek=1;" + "memtable_op_scan_flush_trigger=123;" + "memtable_avg_op_scan_flush_trigger=12;" + "cf_allow_ingest_behind=1;" + "verify_output_flags=2049;", new_options)); ASSERT_NE(new_options->blob_cache.get(), nullptr); @@ -697,6 +713,11 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { new_options->compaction_options_fifo.file_temperature_age_thresholds[0] .age, 12345); + // TODO: try to enhance ObjectLibrary to support singletons + // ASSERT_EQ(new_options->compression_manager, + // GetBuiltinV2CompressionManager()); + ASSERT_STREQ(new_options->compression_manager->Name(), + GetBuiltinV2CompressionManager()->Name()); ColumnFamilyOptions rnd_filled_options = *new_options; @@ -716,6 +737,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { sizeof(std::vector)}, {offsetof(struct MutableCFOptions, compaction_options_fifo), sizeof(struct CompactionOptionsFIFO)}, + {offsetof(struct MutableCFOptions, compression_manager), + sizeof(std::shared_ptr)}, {offsetof(struct MutableCFOptions, compression_per_level), sizeof(std::vector)}, {offsetof(struct MutableCFOptions, max_file_size), diff --git a/options/options_test.cc b/options/options_test.cc index 159cfec85570..1828dc9d86a3 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -160,6 +160,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"keep_log_file_num", "39"}, {"recycle_log_file_num", "5"}, {"max_manifest_file_size", "40"}, + {"max_manifest_space_amp_pct", "42"}, {"table_cache_numshardbits", "41"}, {"WAL_ttl_seconds", "43"}, {"WAL_size_limit_MB", "44"}, @@ -200,7 +201,6 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.write_buffer_size, 1U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2); ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3); - ASSERT_EQ(new_cf_opt.max_write_buffer_number_to_maintain, 99); ASSERT_EQ(new_cf_opt.max_write_buffer_size_to_maintain, -99999); ASSERT_EQ(new_cf_opt.compression, kSnappyCompression); ASSERT_EQ(new_cf_opt.compression_per_level.size(), 8U); @@ -342,7 +342,8 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U); ASSERT_EQ(new_db_opt.keep_log_file_num, 39U); ASSERT_EQ(new_db_opt.recycle_log_file_num, 5U); - ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast(40)); + ASSERT_EQ(new_db_opt.max_manifest_file_size, uint64_t{40}); + ASSERT_EQ(new_db_opt.max_manifest_space_amp_pct, 42); ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41); ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast(43)); ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast(44)); @@ -1721,15 +1722,31 @@ TEST_F(OptionsTest, MutableCFOptions) { ASSERT_OK(GetColumnFamilyOptionsFromString( config_options, cf_opts, - "paranoid_file_checks=true; block_based_table_factory.block_align=false; " + "paranoid_file_checks=true; " + "verify_output_flags=2049; " + "block_based_table_factory.block_align=false; " + "block_based_table_factory.super_block_alignment_size=65536; " + "block_based_table_factory.super_block_alignment_space_overhead_ratio=" + "4096; " "block_based_table_factory.block_size=8192;", &cf_opts)); ASSERT_TRUE(cf_opts.paranoid_file_checks); + ASSERT_NE( + (cf_opts.verify_output_flags & VerifyOutputFlags::kVerifyBlockChecksum), + VerifyOutputFlags::kVerifyNone); + ASSERT_NE((cf_opts.verify_output_flags & + VerifyOutputFlags::kEnableForRemoteCompaction), + VerifyOutputFlags::kVerifyNone); + ASSERT_EQ((cf_opts.verify_output_flags & + VerifyOutputFlags::kEnableForLocalCompaction), + VerifyOutputFlags::kVerifyNone); ASSERT_NE(cf_opts.table_factory.get(), nullptr); auto* bbto = cf_opts.table_factory->GetOptions(); ASSERT_NE(bbto, nullptr); ASSERT_EQ(bbto->block_size, 8192); ASSERT_EQ(bbto->block_align, false); + ASSERT_EQ(bbto->super_block_alignment_size, 65536); + ASSERT_EQ(bbto->super_block_alignment_space_overhead_ratio, 4096); std::unordered_map unused_opts; ASSERT_OK(GetColumnFamilyOptionsFromMap( config_options, cf_opts, {{"paranoid_file_checks", "false"}}, &cf_opts)); @@ -2032,7 +2049,7 @@ TEST_F(OptionsTest, GetStringFromCompressionType) { ASSERT_EQ(res, "kZlibCompression"); ASSERT_NOK( - GetStringFromCompressionType(&res, static_cast(-10))); + GetStringFromCompressionType(&res, static_cast(0x7F))); } TEST_F(OptionsTest, OnlyMutableDBOptions) { @@ -2400,6 +2417,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { {"max_compaction_bytes", "21"}, {"soft_rate_limit", "1.1"}, {"hard_rate_limit", "2.1"}, + {"snap_refresh_nanos", "1000000"}, {"rate_limit_delay_max_milliseconds", "100"}, {"hard_pending_compaction_bytes_limit", "211"}, {"arena_block_size", "22"}, @@ -2464,6 +2482,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { {"keep_log_file_num", "39"}, {"recycle_log_file_num", "5"}, {"max_manifest_file_size", "40"}, + {"max_manifest_space_amp_pct", "42"}, {"table_cache_numshardbits", "41"}, {"WAL_ttl_seconds", "43"}, {"WAL_size_limit_MB", "44"}, @@ -2498,7 +2517,6 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.write_buffer_size, 1U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2); ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3); - ASSERT_EQ(new_cf_opt.max_write_buffer_number_to_maintain, 99); ASSERT_EQ(new_cf_opt.max_write_buffer_size_to_maintain, -99999); ASSERT_EQ(new_cf_opt.compression, kSnappyCompression); ASSERT_EQ(new_cf_opt.compression_per_level.size(), 8U); @@ -2578,6 +2596,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true); ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31"); ASSERT_EQ(new_cf_opt.experimental_mempurge_threshold, 0.003); + ASSERT_EQ(new_cf_opt.verify_output_flags, VerifyOutputFlags::kVerifyNone); ASSERT_EQ(new_cf_opt.enable_blob_files, true); ASSERT_EQ(new_cf_opt.min_blob_size, 1ULL << 10); ASSERT_EQ(new_cf_opt.blob_file_size, 1ULL << 30); @@ -2650,7 +2669,8 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U); ASSERT_EQ(new_db_opt.keep_log_file_num, 39U); ASSERT_EQ(new_db_opt.recycle_log_file_num, 5U); - ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast(40)); + ASSERT_EQ(new_db_opt.max_manifest_file_size, uint64_t{40}); + ASSERT_EQ(new_db_opt.max_manifest_space_amp_pct, 42); ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41); ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast(43)); ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast(44)); diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h index d89d0b8c38f2..1fca386c01c3 100644 --- a/port/jemalloc_helper.h +++ b/port/jemalloc_helper.h @@ -59,33 +59,31 @@ static inline bool HasJemalloc() { return true; } // Declare non-standard jemalloc APIs as weak symbols. We can null-check these // symbols to detect whether jemalloc is linked with the binary. -extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * +extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW* mallocx(size_t, int) JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) __attribute__((__weak__)); -extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * -rallocx(void *, size_t, int) JEMALLOC_ALLOC_SIZE(2) __attribute__((__weak__)); -extern "C" size_t JEMALLOC_NOTHROW xallocx(void *, size_t, size_t, int) +extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW* +rallocx(void*, size_t, int) JEMALLOC_ALLOC_SIZE(2) __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW xallocx(void*, size_t, size_t, int) __attribute__((__weak__)); -extern "C" size_t JEMALLOC_NOTHROW sallocx(const void *, int) - JEMALLOC_ATTR(pure) __attribute__((__weak__)); -extern "C" void JEMALLOC_NOTHROW dallocx(void *, int) __attribute__((__weak__)); -extern "C" void JEMALLOC_NOTHROW sdallocx(void *, size_t, int) +extern "C" size_t JEMALLOC_NOTHROW sallocx(const void*, int) JEMALLOC_ATTR(pure) + __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW dallocx(void*, int) __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW sdallocx(void*, size_t, int) __attribute__((__weak__)); extern "C" size_t JEMALLOC_NOTHROW nallocx(size_t, int) JEMALLOC_ATTR(pure) __attribute__((__weak__)); -extern "C" int JEMALLOC_NOTHROW mallctl(const char *, void *, size_t *, void *, +extern "C" int JEMALLOC_NOTHROW mallctl(const char*, void*, size_t*, void*, size_t) __attribute__((__weak__)); -extern "C" int JEMALLOC_NOTHROW mallctlnametomib(const char *, size_t *, - size_t *) - __attribute__((__weak__)); -extern "C" int JEMALLOC_NOTHROW mallctlbymib(const size_t *, size_t, void *, - size_t *, void *, size_t) +extern "C" int JEMALLOC_NOTHROW mallctlnametomib(const char*, size_t*, size_t*) __attribute__((__weak__)); -extern "C" void JEMALLOC_NOTHROW -malloc_stats_print(void (*)(void *, const char *), void *, const char *) +extern "C" int JEMALLOC_NOTHROW mallctlbymib(const size_t*, size_t, void*, + size_t*, void*, size_t) __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW malloc_stats_print( + void (*)(void*, const char*), void*, const char*) __attribute__((__weak__)); extern "C" size_t JEMALLOC_NOTHROW -malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *) JEMALLOC_CXX_THROW +malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void*) JEMALLOC_CXX_THROW __attribute__((__weak__)); // Check if Jemalloc is linked with the binary. Note the main program might be diff --git a/port/lang.h b/port/lang.h index ab79f9d22a75..f0418cedaeda 100644 --- a/port/lang.h +++ b/port/lang.h @@ -69,6 +69,10 @@ constexpr bool kMustFreeHeapAllocations = false; #define TSAN_SUPPRESSION #endif // TSAN_SUPPRESSION +// Fail in debug build with a useful message, for automatically grouping +// related failures +#define DEBUG_FAIL(msg) assert(false && msg) + // Compile-time CPU feature testing compatibility // // A way to be extra sure these defines have been included. diff --git a/port/mmap.cc b/port/mmap.cc index 36e8f32617fb..36977f17b9f4 100644 --- a/port/mmap.cc +++ b/port/mmap.cc @@ -43,7 +43,7 @@ MemMapping& MemMapping::operator=(MemMapping&& other) noexcept { return *this; } this->~MemMapping(); - std::memcpy(this, &other, sizeof(*this)); + std::memcpy(static_cast(this), &other, sizeof(*this)); new (&other) MemMapping(); return *this; } diff --git a/port/port_example.h b/port/port_example.h index f9e94d00f865..6bbb5b2e330b 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -74,28 +74,5 @@ using OnceType = intptr_t; #define LEVELDB_ONCE_INIT 0 void InitOnce(port::OnceType*, void (*initializer)()); -// ------------------ Compression ------------------- - -// Store the snappy compression of "input[0,input_length-1]" in *output. -// Returns false if snappy is not supported by this port. -bool Snappy_Compress(const char* input, size_t input_length, - std::string* output); - -// If input[0,input_length-1] looks like a valid snappy compressed -// buffer, store the size of the uncompressed data in *result and -// return true. Else return false. -bool Snappy_GetUncompressedLength(const char* input, size_t length, - size_t* result); - -// Attempt to snappy uncompress input[0,input_length-1] into *output. -// Returns true if successful, false if the input is invalid lightweight -// compressed data. -// -// REQUIRES: at least the first "n" bytes of output[] must be writable -// where "n" is the result of a successful call to -// Snappy_GetUncompressedLength. -bool Snappy_Uncompress(const char* input_data, size_t input_length, - char* output); - } // namespace port } // namespace ROCKSDB_NAMESPACE diff --git a/port/port_posix.cc b/port/port_posix.cc index 7042a710dc84..1159d0bf8a63 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -220,8 +220,9 @@ int GetMaxOpenFiles() { return std::numeric_limits::max(); } return static_cast(no_files_limit.rlim_cur); -#endif +#else return -1; +#endif } void* cacheline_aligned_alloc(size_t size) { diff --git a/port/win/io_win.cc b/port/win/io_win.cc index 2ba64b326554..63e5d6a7e16e 100644 --- a/port/win/io_win.cc +++ b/port/win/io_win.cc @@ -242,6 +242,16 @@ size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const { return GetUniqueIdFromFile(hFile_, id, max_size); } +IOStatus WinMmapReadableFile::GetFileSize(uint64_t* size) { + LARGE_INTEGER fileSize; + if (GetFileSizeEx(hFile_, &fileSize)) { + *size = fileSize.QuadPart; + return IOStatus::OK(); + } else { + return IOStatus::IOError("Failed to get file size", filename_); + } +} + /////////////////////////////////////////////////////////////////////////////// /// WinMmapFile @@ -735,6 +745,16 @@ size_t WinRandomAccessFile::GetRequiredBufferAlignment() const { return GetAlignment(); } +IOStatus WinRandomAccessFile::GetFileSize(uint64_t* size) { + LARGE_INTEGER fileSize; + if (GetFileSizeEx(hFile_, &fileSize)) { + *size = fileSize.QuadPart; + return IOStatus::OK(); + } else { + return IOStatus::IOError("Failed to get file size", filename_); + } +} + ///////////////////////////////////////////////////////////////////////////// // WinWritableImpl // diff --git a/port/win/io_win.h b/port/win/io_win.h index e1a6197ce86b..29511d47ee68 100644 --- a/port/win/io_win.h +++ b/port/win/io_win.h @@ -152,6 +152,8 @@ class WinMmapReadableFile : private WinFileData, public FSRandomAccessFile { IOStatus InvalidateCache(size_t offset, size_t length) override; size_t GetUniqueId(char* id, size_t max_size) const override; + + IOStatus GetFileSize(uint64_t* file_size) override; }; // We preallocate and use memcpy to append new @@ -292,6 +294,8 @@ class WinRandomAccessFile IOStatus InvalidateCache(size_t offset, size_t length) override; size_t GetRequiredBufferAlignment() const override; + + IOStatus GetFileSize(uint64_t* file_size) override; }; // This is a sequential write class. It has been mimicked (as others) after diff --git a/port/win/xpress_win.cc b/port/win/xpress_win.cc index 21904d502674..a90179bc1283 100644 --- a/port/win/xpress_win.cc +++ b/port/win/xpress_win.cc @@ -125,6 +125,57 @@ bool Compress(const char* input, size_t length, std::string* output) { return true; } +size_t CompressWithMaxSize(const char* input, size_t length, char* output, + size_t max_output_size) { + assert(input != nullptr); + if (max_output_size == 0) { + return 0; + } + assert(output != nullptr); + + COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr; + + COMPRESSOR_HANDLE compressor = NULL; + + BOOL success = + CreateCompressor(COMPRESS_ALGORITHM_XPRESS, // Compression Algorithm + allocRoutinesPtr, // Optional allocation routine + &compressor); // Handle + + if (!success) { +#ifdef _DEBUG + std::cerr << "XPRESS: Failed to create Compressor LastError: " + << GetLastError() << std::endl; +#endif + return 0; + } + + std::unique_ptr compressorGuard( + compressor, CloseCompressorFun); + + SIZE_T compressed_size = 0; + // Compress + success = ::Compress(compressor, // Compressor Handle + const_cast(input), // Input buffer + length, // Uncompressed data size + output, // Compressed Buffer + max_output_size, // Compressed Buffer size + &compressed_size); // Compressed Data size + + if (!success) { +#ifdef _DEBUG + auto error = GetLastError(); + if (error != ERROR_INSUFFICIENT_BUFFER) { + std::cerr << "XPRESS: Failed to compress LastError " << error + << std::endl; + } +#endif + return 0; + } else { + return compressed_size; + } +} + char* Decompress(const char* input_data, size_t input_length, size_t* uncompressed_size) { assert(input_data != nullptr); @@ -151,7 +202,7 @@ char* Decompress(const char* input_data, size_t input_length, return nullptr; } - std::unique_ptr compressorGuard( + std::unique_ptr decompressorGuard( decompressor, CloseDecompressorFun); SIZE_T decompressedBufferSize = 0; @@ -201,6 +252,104 @@ char* Decompress(const char* input_data, size_t input_length, // Return the raw buffer to the caller supporting the tradition return outputBuffer.release(); } + +int64_t GetDecompressedSize(const char* input_data, size_t input_length) { + assert(input_data != nullptr); + + if (input_length == 0) { + return 0; + } + + COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr; + + DECOMPRESSOR_HANDLE decompressor = NULL; + + BOOL success = + CreateDecompressor(COMPRESS_ALGORITHM_XPRESS, // Compression Algorithm + allocRoutinesPtr, // Optional allocation routine + &decompressor); // Handle + + if (!success) { +#ifdef _DEBUG + std::cerr << "XPRESS: Failed to create Decompressor LastError " + << GetLastError() << std::endl; +#endif + return -1; + } + + std::unique_ptr decompressorGuard( + decompressor, CloseDecompressorFun); + + SIZE_T decompressedBufferSize = 0; + + success = ::Decompress(decompressor, // Compressor Handle + const_cast(input_data), // Compressed data + input_length, // Compressed data size + NULL, // Buffer set to NULL + 0, // Buffer size set to 0 + &decompressedBufferSize); // Decompressed Data size + + assert(!success); + auto lastError = GetLastError(); + + if (lastError != ERROR_INSUFFICIENT_BUFFER) { +#ifdef _DEBUG + std::cerr + << "XPRESS: Failed to estimate decompressed buffer size LastError " + << lastError << std::endl; +#endif + return -1; + } + + assert(decompressedBufferSize > 0); + return static_cast(decompressedBufferSize); +} + +int64_t DecompressToBuffer(const char* input, size_t input_length, char* output, + size_t output_length) { + assert(input != nullptr); + assert(output != nullptr); + + if (input_length == 0) { + return 0; + } + + COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr; + + DECOMPRESSOR_HANDLE decompressor = NULL; + + BOOL success = + CreateDecompressor(COMPRESS_ALGORITHM_XPRESS, // Compression Algorithm + allocRoutinesPtr, // Optional allocation routine + &decompressor); // Handle + + if (!success) { +#ifdef _DEBUG + std::cerr << "XPRESS: Failed to create Decompressor LastError " + << GetLastError() << std::endl; +#endif + return -1; + } + + std::unique_ptr decompressorGuard( + decompressor, CloseDecompressorFun); + + SIZE_T decompressedDataSize = 0; + + success = ::Decompress(decompressor, const_cast(input), input_length, + output, output_length, &decompressedDataSize); + + if (!success) { +#ifdef _DEBUG + std::cerr << "XPRESS: Failed to decompress LastError " << GetLastError() + << std::endl; +#endif + return -1; + } + + return static_cast(decompressedDataSize); +} + } // namespace xpress } // namespace port } // namespace ROCKSDB_NAMESPACE diff --git a/port/win/xpress_win.h b/port/win/xpress_win.h index 187adffa658a..00cc1b9fc3dc 100644 --- a/port/win/xpress_win.h +++ b/port/win/xpress_win.h @@ -19,8 +19,18 @@ namespace xpress { bool Compress(const char* input, size_t length, std::string* output); +// Returns written size or 0 on failure including if buffer is too small. +size_t CompressWithMaxSize(const char* input, size_t length, char* output, + size_t max_output_size); + char* Decompress(const char* input_data, size_t input_length, size_t* uncompressed_size); + +int64_t GetDecompressedSize(const char* input, size_t input_length); + +int64_t DecompressToBuffer(const char* input, size_t input_length, char* output, + size_t output_length); + } // namespace xpress } // namespace port } // namespace ROCKSDB_NAMESPACE diff --git a/src.mk b/src.mk index 3f1de6688684..a77efc8f6123 100644 --- a/src.mk +++ b/src.mk @@ -80,6 +80,7 @@ LIB_SOURCES = \ db/memtable_list.cc \ db/merge_helper.cc \ db/merge_operator.cc \ + db/multi_scan.cc \ db/output_validator.cc \ db/periodic_task_scheduler.cc \ db/range_del_aggregator.cc \ @@ -205,7 +206,7 @@ LIB_SOURCES = \ table/cuckoo/cuckoo_table_builder.cc \ table/cuckoo/cuckoo_table_factory.cc \ table/cuckoo/cuckoo_table_reader.cc \ - table/external_table_reader.cc \ + table/external_table.cc \ table/format.cc \ table/get_context.cc \ table/iterator.cc \ @@ -237,6 +238,7 @@ LIB_SOURCES = \ trace_replay/block_cache_tracer.cc \ trace_replay/io_tracer.cc \ util/async_file_reader.cc \ + util/auto_tune_compressor.cc \ util/build_version.cc \ util/cleanable.cc \ util/coding.cc \ @@ -256,11 +258,13 @@ LIB_SOURCES = \ util/ribbon_config.cc \ util/slice.cc \ util/file_checksum_helper.cc \ + util/simple_mixed_compressor.cc \ util/status.cc \ util/stderr_logger.cc \ util/string_util.cc \ util/thread_local.cc \ util/threadpool_imp.cc \ + util/io_dispatcher_imp.cc \ util/udt_util.cc \ util/write_batch_util.cc \ util/xxhash.cc \ @@ -364,6 +368,7 @@ RANGE_TREE_SOURCES =\ utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc TOOL_LIB_SOURCES = \ + db_stress_tool/db_stress_compression_manager.cc \ tools/io_tracer_parser_tool.cc \ tools/ldb_cmd.cc \ tools/ldb_tool.cc \ @@ -382,19 +387,23 @@ BENCH_LIB_SOURCES = \ tools/tool_hooks.cc \ tools/simulated_hybrid_file_system.cc \ -CACHE_BENCH_LIB_SOURCES = \ +CACHE_BENCH_LIB_SOURCES = \ cache/cache_bench_tool.cc \ +POINT_LOCK_BENCH_LIB_SOURCES = \ + utilities/transactions/lock/point/point_lock_bench_tool.cc \ + STRESS_LIB_SOURCES = \ db_stress_tool/batched_ops_stress.cc \ db_stress_tool/cf_consistency_stress.cc \ db_stress_tool/db_stress_common.cc \ + db_stress_tool/db_stress_compaction_service.cc \ + db_stress_tool/db_stress_compression_manager.cc \ db_stress_tool/db_stress_driver.cc \ db_stress_tool/db_stress_filters.cc \ db_stress_tool/db_stress_gflags.cc \ db_stress_tool/db_stress_listener.cc \ db_stress_tool/db_stress_shared_state.cc \ - db_stress_tool/db_stress_stat.cc \ db_stress_tool/db_stress_test_base.cc \ db_stress_tool/db_stress_tool.cc \ db_stress_tool/db_stress_wide_merge_operator.cc \ @@ -481,11 +490,13 @@ TEST_MAIN_SOURCES = \ db/db_basic_test.cc \ db/db_block_cache_test.cc \ db/db_bloom_filter_test.cc \ + db/db_compaction_abort_test.cc \ db/db_compaction_filter_test.cc \ db/db_compaction_test.cc \ db/db_clip_test.cc \ db/db_dynamic_level_test.cc \ db/db_encryption_test.cc \ + db/db_etc3_test.cc \ db/db_flush_test.cc \ db/db_follower_test.cc \ db/db_readonly_with_timestamp_test.cc \ @@ -591,6 +602,7 @@ TEST_MAIN_SOURCES = \ table/table_test.cc \ table/block_fetcher_test.cc \ test_util/testutil_test.cc \ + util/compression_test.cc \ tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc \ tools/io_tracer_parser_test.cc \ tools/ldb_cmd_test.cc \ @@ -609,6 +621,8 @@ TEST_MAIN_SOURCES = \ util/file_reader_writer_test.cc \ util/hash_test.cc \ util/heap_test.cc \ + util/interval_test.cc \ + util/io_dispatcher_test.cc \ util/random_test.cc \ util/rate_limiter_test.cc \ util/repeatable_thread_test.cc \ @@ -645,7 +659,9 @@ TEST_MAIN_SOURCES = \ utilities/transactions/lock/range/range_locking_test.cc \ utilities/transactions/transaction_test.cc \ utilities/transactions/lock/point/point_lock_manager_test.cc \ + utilities/transactions/lock/point/point_lock_manager_stress_test.cc \ utilities/transactions/write_prepared_transaction_test.cc \ + utilities/transactions/write_prepared_transaction_test_seqno.cc \ utilities/transactions/write_unprepared_transaction_test.cc \ utilities/transactions/write_committed_transaction_ts_test.cc \ utilities/transactions/timestamped_snapshot_test.cc \ diff --git a/table/adaptive/adaptive_table_factory.cc b/table/adaptive/adaptive_table_factory.cc index f06b265328f8..db3f7625a710 100644 --- a/table/adaptive/adaptive_table_factory.cc +++ b/table/adaptive/adaptive_table_factory.cc @@ -51,8 +51,7 @@ Status AdaptiveTableFactory::NewTableReader( footer.table_magic_number() == kLegacyPlainTableMagicNumber) { return plain_table_factory_->NewTableReader( table_reader_options, std::move(file), file_size, table); - } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber || - footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) { + } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber) { return block_based_table_factory_->NewTableReader( ro, table_reader_options, std::move(file), file_size, table, prefetch_index_and_filter_in_cache); diff --git a/table/block_based/binary_search_index_reader.cc b/table/block_based/binary_search_index_reader.cc index abe09d86fb3a..940bb261db23 100644 --- a/table/block_based/binary_search_index_reader.cc +++ b/table/block_based/binary_search_index_reader.cc @@ -63,7 +63,8 @@ InternalIteratorBase* BinarySearchIndexReader::NewIterator( internal_comparator()->user_comparator(), rep->get_global_seqno(BlockType::kIndex), iter, kNullStats, true, index_has_first_key(), index_key_includes_seq(), index_value_is_full(), - false /* block_contents_pinned */, user_defined_timestamps_persisted()); + false /* block_contents_pinned */, user_defined_timestamps_persisted(), + nullptr /* prefix_index */, rep->table_options.index_block_search_type); assert(it != nullptr); index_block.TransferTo(it); diff --git a/table/block_based/block.cc b/table/block_based/block.cc index ea4d559a2a40..fe316a37be72 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -24,6 +24,7 @@ #include "table/block_based/data_block_footer.h" #include "table/format.h" #include "util/coding.h" +#include "util/math.h" namespace ROCKSDB_NAMESPACE { @@ -152,6 +153,39 @@ struct DecodeEntryV4 { } }; +// Read first 8 bytes (starting at offset) as big-endian uint64_t, padding +// with zeros on the right if the key is shorter. This preserves +// lexicographic ordering. +// +// If s.size() >= offset, then returns 0. +static uint64_t ReadBe64FromKey(Slice s, bool is_user_key, size_t offset) { + if (!is_user_key) { + assert(s.size() >= kNumInternalBytes); + s = Slice(s.data(), s.size() - kNumInternalBytes); + } + offset = std::min(offset, s.size()); + size_t remaining = s.size() - offset; + + // fast path + if (remaining >= 8) { + uint64_t val; + memcpy(&val, s.data() + offset, sizeof(val)); + if (port::kLittleEndian) { + return EndianSwapValue(val); + } + return val; + } + + uint64_t val = 0; + for (size_t i = 0; i < remaining; i++) { + val = (val << 8) | static_cast(s.data()[offset + i]); + } + if (remaining > 0) { + val <<= (8 - remaining) * 8; // Pad zeros on the right + } + return val; +} + void DataBlockIter::NextImpl() { #ifndef NDEBUG if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) { @@ -307,7 +341,8 @@ void DataBlockIter::SeekImpl(const Slice& target) { } uint32_t index = 0; bool skip_linear_scan = false; - bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); + bool ok = BinarySeekRestartPointIndex(seek_key, &index, + &skip_linear_scan); if (!ok) { return; @@ -323,7 +358,8 @@ void MetaBlockIter::SeekImpl(const Slice& target) { } uint32_t index = 0; bool skip_linear_scan = false; - bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); + bool ok = BinarySeekRestartPointIndex(seek_key, &index, + &skip_linear_scan); if (!ok) { return; @@ -440,8 +476,8 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { return true; } - if (icmp_->user_comparator()->Compare(raw_key_.GetUserKey(), - target_user_key) != 0) { + if (icmp_.user_comparator()->Compare(raw_key_.GetUserKey(), + target_user_key) != 0) { // the key is not in this block and cannot be at the next block either. return false; } @@ -494,10 +530,14 @@ void IndexBlockIter::SeekImpl(const Slice& target) { // restart interval must be one when hash search is enabled so the binary // search simply lands at the right place. skip_linear_scan = true; - } else if (value_delta_encoded_) { - ok = BinarySeek(seek_key, &index, &skip_linear_scan); } else { - ok = BinarySeek(seek_key, &index, &skip_linear_scan); + if (value_delta_encoded_) { + ok = FindRestartPointForSeek(seek_key, &index, + &skip_linear_scan); + } else { + ok = FindRestartPointForSeek(seek_key, &index, + &skip_linear_scan); + } } if (!ok) { @@ -506,6 +546,18 @@ void IndexBlockIter::SeekImpl(const Slice& target) { FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); } +template +bool IndexBlockIter::FindRestartPointForSeek(const Slice& seek_key, + uint32_t* index, + bool* skip_linear_scan) { + if (index_search_type_ == BlockBasedTableOptions::kBinary) { + return BinarySeekRestartPointIndex(seek_key, index, + skip_linear_scan); + } + return InterpolationSeekRestartPointIndex(seek_key, index, + skip_linear_scan); +} + void DataBlockIter::SeekForPrevImpl(const Slice& target) { PERF_TIMER_GUARD(block_seek_nanos); Slice seek_key = target; @@ -514,7 +566,8 @@ void DataBlockIter::SeekForPrevImpl(const Slice& target) { } uint32_t index = 0; bool skip_linear_scan = false; - bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); + bool ok = BinarySeekRestartPointIndex(seek_key, &index, + &skip_linear_scan); if (!ok) { return; @@ -540,7 +593,8 @@ void MetaBlockIter::SeekForPrevImpl(const Slice& target) { } uint32_t index = 0; bool skip_linear_scan = false; - bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); + bool ok = BinarySeekRestartPointIndex(seek_key, &index, + &skip_linear_scan); if (!ok) { return; @@ -816,9 +870,27 @@ void BlockIter::FindKeyAfterBinarySeek(const Slice& target, } } -// Binary searches in restart array to find the starting restart point for the -// linear scan, and stores it in `*index`. Assumes restart array does not -// contain duplicate keys. It is guaranteed that the restart key at `*index + 1` +// Get the key slice at a given restart point index. +template +template +bool BlockIter::GetRestartKey(uint32_t index, Slice* key) { + uint32_t region_offset = GetRestartPoint(index); + uint32_t shared, non_shared; + const char* key_ptr = DecodeKeyFunc()( + data_ + region_offset, data_ + restarts_, &shared, &non_shared); + if (key_ptr == nullptr || (shared != 0)) { + CorruptionError(); + return false; + } + *key = Slice(key_ptr, non_shared); + return true; +} + +// Searches in restart array using binary search to find the starting restart +// point for the linear scan, and stores it in `*index`. Assumes restart array +// does not contain duplicate keys. +// +// It is guaranteed that the restart key at `*index + 1` // is strictly greater than `target` or does not exist (this can be used to // elide a comparison when linear scan reaches all the way to the next restart // key). Furthermore, `*skip_linear_scan` is set to indicate whether the @@ -826,15 +898,15 @@ void BlockIter::FindKeyAfterBinarySeek(const Slice& target, // compared again later. template template -bool BlockIter::BinarySeek(const Slice& target, uint32_t* index, - bool* skip_linear_scan) { +bool BlockIter::BinarySeekRestartPointIndex(const Slice& target, + uint32_t* index, + bool* skip_linear_scan) { if (restarts_ == 0) { // SST files dedicated to range tombstones are written with index blocks // that have no keys while also having `num_restarts_ == 1`. This would - // cause a problem for `BinarySeek()` as it'd try to access the first key - // which does not exist. We identify such blocks by the offset at which - // their restarts are stored, and return false to prevent any attempted - // key accesses. + // cause a problem as we'd try to access the first key which does not exist. + // We identify such blocks by the offset at which their restarts are stored, + // and return false to prevent any attempted key accesses. return false; } @@ -842,23 +914,25 @@ bool BlockIter::BinarySeek(const Slice& target, uint32_t* index, // Loop invariants: // - Restart key at index `left` is less than or equal to the target key. The // sentinel index `-1` is considered to have a key that is less than all - // keys. + // keys. Doing this allows us to avoid a bounds check on left. // - Any restart keys after index `right` are strictly greater than the target // key. - int64_t left = -1, right = num_restarts_ - 1; + int64_t left = -1; + int64_t right = num_restarts_ - 1; + while (left != right) { // The `mid` is computed by rounding up so it lands in (`left`, `right`]. int64_t mid = left + (right - left + 1) / 2; - uint32_t region_offset = GetRestartPoint(static_cast(mid)); - uint32_t shared, non_shared; - const char* key_ptr = DecodeKeyFunc()( - data_ + region_offset, data_ + restarts_, &shared, &non_shared); - if (key_ptr == nullptr || (shared != 0)) { - CorruptionError(); + + assert(left < mid && mid <= right); + + Slice mid_key; + if (!GetRestartKey(static_cast(mid), &mid_key)) { return false; } - Slice mid_key(key_ptr, non_shared); + UpdateRawKeyAndMaybePadMinTimestamp(mid_key); + int cmp = CompareCurrentKey(target); if (cmp < 0) { // Key at "mid" is smaller than "target". Therefore all @@ -885,22 +959,317 @@ bool BlockIter::BinarySeek(const Slice& target, uint32_t* index, return true; } +// Similar effects to BinarySeekRestartPointIndex, except it uses a different +// algorithm to search for the restart point index (i.e. interpolation search). +// Interpolation search is typically more efficient for uniformly distributed +// datasets. +// +// Typically, interpolation search requires an integer "value". But because we +// are searching through variable length binary slices, we must estimate an +// integer value for each key. Currently, the value is set to be the first 8 +// bytes (read big-endian) that do not share a prefix with the start and end +// key. As a side effect, this can really only be used with the +// BytewiseComparator(). +template +template +bool BlockIter::InterpolationSeekRestartPointIndex( + const Slice& target, uint32_t* index, bool* skip_linear_scan) { + static constexpr int64_t kGuardLen = 8; + static constexpr uint64_t kMaxPoorSearches = 8; + + if (restarts_ == 0) { + return false; + } + + *skip_linear_scan = false; + // Currently it is assumed that comparator is always bytewise comparator, but + // it may also be useful to to generalize to reverse bytewise in the future. + assert(icmp_.user_comparator() == BytewiseComparator()); + + int64_t left = -1; + int64_t right = num_restarts_ - 1; + size_t shared_user_prefix_len = 0; + + Slice left_key; + Slice right_key; + Slice left_key_suffix; + Slice right_key_suffix; + Slice target_suffix = target; + bool seek_failed = false; + bool first_iter = true; + uint64_t left_val = 0; + uint64_t right_val = 0; + uint64_t target_val = 0; + + // A poor search is when less than half the search space is reduced, because + // binary search would do better. When there are kMaxPoorSearches in a row, + // then fallback to binary search. This helps bound worse cast performance. + uint64_t continuous_poor_searches = 0; + + // Loop invariants while not first iteration AND seek has not failed: + // - arr[usable_left] = left_key, arr[right] = right_key + // - left < mid <= right, and arr[left] < target < arr[right + 1] + // + // The first iteration is used as an early optimization to determine initial + // bounds, and whether target is within those bounds. + const bool is_user_key = raw_key_.IsUserKey(); + const Slice target_user_key = is_user_key ? target : ExtractUserKey(target); + while (left != right) { + int64_t mid = 0; + + // If either search window is small or we've bad numerous bad guesses, then + // fallback to binary search + seek_failed = (right - left <= kGuardLen) || + continuous_poor_searches >= kMaxPoorSearches; + + if (!seek_failed) { + // Interpolation seek reads left and right boundaries anyways, so we can + // set left = 0. The invariant that left <= target is still held because + // we early exit if left > target for the first iteration. + const uint32_t usable_left = + static_cast(std::max(left, 0)); + + // First iteration: decode both boundary keys and compute shared prefix. + if (first_iter) { + if (!GetRestartKey(usable_left, &left_key)) { + return false; + } + + if (!GetRestartKey(static_cast(right), + &right_key)) { + return false; + } + + // Compute the shared prefix length between the user key portions of + // the boundary keys. This is used to "normalize" the values calculated + // during interpolation search. + shared_user_prefix_len = left_key.difference_offset(right_key); + if (!is_user_key) { + // Ensure shared_user_prefix_len is only limited to user key. Suppose + // that the shared prefix of both keys are extended into the internal + // footer. If they are not the same user keys, then it is guaranteed + // left is the shorter one due to bytewise comparator. For reverse + // bytewise, this would be flipped. + shared_user_prefix_len = std::min( + shared_user_prefix_len, left_key.size() - kNumInternalBytes); + assert(shared_user_prefix_len <= + right_key.size() - kNumInternalBytes); + } + + left_val = + ReadBe64FromKey(left_key, is_user_key, shared_user_prefix_len); + right_val = + ReadBe64FromKey(right_key, is_user_key, shared_user_prefix_len); + target_val = + ReadBe64FromKey(target, is_user_key, shared_user_prefix_len); + } + + assert(shared_user_prefix_len <= left_key.size() && + shared_user_prefix_len <= right_key.size()); + + if (first_iter && shared_user_prefix_len > 0) { + // It is not guaranteed that the shared_prefix of the left and right + // boundaries is a valid prefix of the target. If it is not, then we can + // early exit. + size_t cmp_len = + std::min(target_user_key.size(), shared_user_prefix_len); + int cmp = memcmp(target_user_key.data(), left_key.data(), cmp_len); + if (cmp < 0 || (cmp == 0 && cmp_len < shared_user_prefix_len)) { +#ifndef NDEBUG + IterKey tmp_key; + tmp_key.SetIsUserKey(is_user_key); + UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, left_key); + assert(CompareKey(tmp_key, target) >= 0); +#endif + // if target size is less than shared_prefix length, and cmp == 0, + // then it is guaranteed <= left + *skip_linear_scan = true; + *index = usable_left; + return true; + } else if (cmp > 0) { +#ifndef NDEBUG + IterKey tmp_key; + tmp_key.SetIsUserKey(is_user_key); + UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, right_key); + assert(CompareKey(tmp_key, target) < 0); +#endif + *index = static_cast(right); + return true; + } + } + + assert(shared_user_prefix_len <= target_user_key.size()); + assert(memcmp(left_key.data(), target_user_key.data(), + shared_user_prefix_len) == 0); + assert(memcmp(right_key.data(), target_user_key.data(), + shared_user_prefix_len) == 0); + + if (first_iter) { + left_key_suffix = Slice(left_key.data() + shared_user_prefix_len, + left_key.size() - shared_user_prefix_len); + right_key_suffix = Slice(right_key.data() + shared_user_prefix_len, + right_key.size() - shared_user_prefix_len); + target_suffix = Slice(target.data() + shared_user_prefix_len, + target.size() - shared_user_prefix_len); + } + + if (left_val > right_val) { + CorruptionError("left key is greater than right key"); + return false; + } + + bool lte_left = false; + bool gt_right = false; + + if (target_val < left_val) { + assert(first_iter); + assert(CompareKey(left_key_suffix, target_suffix) > 0); + lte_left = true; + } else if (target_val == left_val) { + // target_val == left_val doesn't imply target == left_key + // because ReadBe64FromKey only reads 8 bytes and skips sequence + // numbers. We need to check actual key order. + if (CompareKey(left_key_suffix, target_suffix) >= 0) { + assert(first_iter); + lte_left = true; + } + } + + if (!lte_left && !seek_failed) { + if (target_val > right_val) { + // note that we only ever guarantee arr[target] < arr[right + 1], so + // it is possible to end up here even on non-first iteration + assert(CompareKey(right_key_suffix, target_suffix) < 0); + gt_right = true; + } else if (right_val == left_val) { + // cannot divide by 0 + seek_failed = true; + } + } + + // early exit if key is not within bounds + if (lte_left) { +#ifndef NDEBUG + assert(!seek_failed); + IterKey tmp_key; + tmp_key.SetIsUserKey(is_user_key); + UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, left_key); + assert(CompareKey(tmp_key, target) >= 0); +#endif + *skip_linear_scan = true; + *index = usable_left; + return true; + } + if (gt_right) { +#ifndef NDEBUG + assert(!seek_failed); + IterKey tmp_key; + tmp_key.SetIsUserKey(is_user_key); + UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, right_key); + assert(CompareKey(tmp_key, target) < 0); +#endif + *index = static_cast(right); + return true; + } + + if (!seek_failed) { +#ifdef HAVE_UINT128_EXTENSION + __uint128_t range = right - usable_left; + __uint128_t target_delta = target_val - left_val; + uint64_t range_delta = right_val - left_val; + int64_t offset = + static_cast(range * target_delta / range_delta); +#else + double ratio = static_cast(target_val - left_val) / + static_cast(right_val - left_val); + assert(0 <= ratio && ratio <= 1); + int64_t range = right - usable_left; + int64_t offset = static_cast(range * ratio); +#endif + left = usable_left; // can reduce search space by 1 + mid = usable_left + offset; + assert(mid <= right); + if (mid == usable_left) { + // this is to guarantee progress and avoid infinite loop + ++mid; + } + } + } + + if (seek_failed) { + // Fallback to binary seek + mid = left + (right - left + 1) / 2; + } + + assert(left < mid && mid <= right); + + Slice mid_key; + if (!GetRestartKey(static_cast(mid), &mid_key)) { + return false; + } + + Slice mid_key_suffix(mid_key.data() + shared_user_prefix_len, + mid_key.size() - shared_user_prefix_len); + + UpdateRawKeyAndMaybePadMinTimestamp(mid_key_suffix); + int cmp = CompareCurrentKey(target_suffix); + + int64_t previous_search_space = right - left; + if (cmp < 0) { + left = mid; + left_key = mid_key; + left_key_suffix = mid_key_suffix; + left_val = ReadBe64FromKey(left_key, is_user_key, shared_user_prefix_len); + } else if (cmp > 0) { + right = mid - 1; + if (!seek_failed && left != right) { + if (!GetRestartKey(static_cast(right), + &right_key)) { + return false; + } + right_key_suffix = Slice(right_key.data() + shared_user_prefix_len, + right_key.size() - shared_user_prefix_len); + right_val = + ReadBe64FromKey(right_key, is_user_key, shared_user_prefix_len); + } + } else { + *skip_linear_scan = true; + left = right = mid; + } + + // If seach space is not reduced by at least half, good chance this data is + // not uniform. + int64_t new_search_space = right - left; + if (new_search_space > previous_search_space / 2) { + ++continuous_poor_searches; + } else { + continuous_poor_searches = 0; + } + + first_iter = false; + } + + if (left == -1) { + // All keys in the block were strictly greater than `target`. So the very + // first key in the block is the final seek result. + *skip_linear_scan = true; + *index = 0; + } else { + *index = static_cast(left); + } + return true; +} + // Compare target key and the block key of the block of `block_index`. // Return -1 if error. int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { - uint32_t region_offset = GetRestartPoint(block_index); - uint32_t shared, non_shared; - const char* key_ptr = - value_delta_encoded_ - ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared, - &non_shared) - : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared, - &non_shared); - if (key_ptr == nullptr || (shared != 0)) { - CorruptionError(); + Slice block_key; + bool ok = value_delta_encoded_ + ? GetRestartKey(block_index, &block_key) + : GetRestartKey(block_index, &block_key); + if (!ok) { return 1; // Return target is smaller } - Slice block_key(key_ptr, non_shared); UpdateRawKeyAndMaybePadMinTimestamp(block_key); return CompareCurrentKey(target); } @@ -1015,39 +1384,12 @@ bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index, } } -uint32_t Block::NumRestarts() const { - assert(size_ >= 2 * sizeof(uint32_t)); - uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); - uint32_t num_restarts = block_footer; - if (size_ > kMaxBlockSizeSupportedByHashIndex) { - // In BlockBuilder, we have ensured a block with HashIndex is less than - // kMaxBlockSizeSupportedByHashIndex (64KiB). - // - // Therefore, if we encounter a block with a size > 64KiB, the block - // cannot have HashIndex. So the footer will directly interpreted as - // num_restarts. - // - // Such check is for backward compatibility. We can ensure legacy block - // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted - // correctly as no HashIndex even if the MSB of num_restarts is set. - return num_restarts; - } - BlockBasedTableOptions::DataBlockIndexType index_type; - UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); - return num_restarts; -} - BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const { - assert(size_ >= 2 * sizeof(uint32_t)); - if (size_ > kMaxBlockSizeSupportedByHashIndex) { - // The check is for the same reason as that in NumRestarts() - return BlockBasedTableOptions::kDataBlockBinarySearch; - } - uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); - uint32_t num_restarts = block_footer; - BlockBasedTableOptions::DataBlockIndexType index_type; - UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); - return index_type; + assert(size() >= DataBlockFooter::kMinEncodedLength); + Slice input(data(), size()); + DataBlockFooter footer; + footer.DecodeFrom(&input).PermitUncheckedError(); + return footer.index_type; } Block::~Block() { @@ -1057,56 +1399,73 @@ Block::~Block() { delete[] kv_checksum_; } +Status Block::GetCorruptionStatus() const { + // Re-process the footer to get a detailed error status. + // This should only be called when size() == 0 (error marker). + assert(size() == 0); + // When size() == 0 and restart_offset_ != 0, restart_offset_ stores the + // original data size for re-decoding the footer to get detailed error. + if (restart_offset_ == 0) { + return Status::Corruption("bad block contents"); + } + Slice input(contents_.data.data(), restart_offset_); + DataBlockFooter footer; + Status s = footer.DecodeFrom(&input); + if (!s.ok()) { + return s; // Return the detailed error from DecodeFrom + } + // Footer decoded OK, so error was in later processing (shouldn't happen) + DEBUG_FAIL("ok status on presumed bad block contents"); + return Status::Corruption("presumed bad block contents"); +} + Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit, Statistics* statistics) - : contents_(std::move(contents)), - data_(contents_.data.data()), - size_(contents_.data.size()), - restart_offset_(0), - num_restarts_(0) { + : contents_(std::move(contents)), restart_offset_(0), num_restarts_(0) { TEST_SYNC_POINT("Block::Block:0"); - if (size_ < sizeof(uint32_t)) { - size_ = 0; // Error marker + auto& size = contents_.data.size_; + // `contents` is assumed to be uncompressed in the proper format + Slice input(contents_.data.data(), size); + DataBlockFooter footer; + Status s = footer.DecodeFrom(&input); + if (!s.ok()) { + // Save original size for GetCorruptionStatus() to re-decode footer + restart_offset_ = static_cast(size); + size = 0; // Error marker } else { - // Should only decode restart points for uncompressed blocks - num_restarts_ = NumRestarts(); - switch (IndexType()) { + // After DecodeFrom, input has the footer removed. Each case below + // may strip additional suffix (e.g., hash index) so that input ends + // with just the restart array. + num_restarts_ = footer.num_restarts; + switch (footer.index_type) { case BlockBasedTableOptions::kDataBlockBinarySearch: - restart_offset_ = static_cast(size_) - - (1 + num_restarts_) * sizeof(uint32_t); - if (restart_offset_ > size_ - sizeof(uint32_t)) { - // The size is too small for NumRestarts() and therefore - // restart_offset_ wrapped around. - size_ = 0; - } break; case BlockBasedTableOptions::kDataBlockBinaryAndHash: - if (size_ < sizeof(uint32_t) /* block footer */ + - sizeof(uint16_t) /* NUM_BUCK */) { - size_ = 0; + if (input.size() < sizeof(uint16_t) /* NUM_BUCK */) { + size = 0; break; } - uint16_t map_offset; - data_block_hash_index_.Initialize( - data_, static_cast(size_ - sizeof(uint32_t)), /*chop off - NUM_RESTARTS*/ - &map_offset); - - restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t); - - if (restart_offset_ > map_offset) { - // map_offset is too small for NumRestarts() and - // therefore restart_offset_ wrapped around. - size_ = 0; - break; - } + data_block_hash_index_.Initialize(contents_.data.data(), + static_cast(input.size()), + &map_offset); + // Strip the hash index, leaving just data + restarts + input.remove_suffix(input.size() - map_offset); break; default: - size_ = 0; // Error marker + size = 0; // Error marker + } + // After the switch, input should end with restarts[num_restarts_] + if (size != 0) { + if (input.size() < num_restarts_ * sizeof(uint32_t)) { + size = 0; // Block too small for the declared number of restarts + } else { + restart_offset_ = static_cast(input.size()) - + num_restarts_ * sizeof(uint32_t); + } } } - if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) { + if (read_amp_bytes_per_bit != 0 && statistics && size != 0) { read_amp_bitmap_.reset(new BlockReadAmpBitmap( restart_offset_, read_amp_bytes_per_bit, statistics)); } @@ -1148,7 +1507,7 @@ void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key, assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); } if (!iter->status().ok()) { - size_ = 0; // Error marker + contents_.data.size_ = 0; // Error marker return; } protection_bytes_per_key_ = protection_bytes_per_key; @@ -1197,7 +1556,7 @@ void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key, assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); } if (!iter->status().ok()) { - size_ = 0; // Error marker + contents_.data.size_ = 0; // Error marker return; } protection_bytes_per_key_ = protection_bytes_per_key; @@ -1231,7 +1590,7 @@ void Block::InitializeMetaIndexBlockProtectionInfo( assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); } if (!iter->status().ok()) { - size_ = 0; // Error marker + contents_.data.size_ = 0; // Error marker return; } protection_bytes_per_key_ = protection_bytes_per_key; @@ -1240,14 +1599,14 @@ void Block::InitializeMetaIndexBlockProtectionInfo( MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) { MetaBlockIter* iter = new MetaBlockIter(); - if (size_ < 2 * sizeof(uint32_t)) { - iter->Invalidate(Status::Corruption("bad block contents")); + if (size() < 2 * sizeof(uint32_t)) { + iter->Invalidate(GetCorruptionStatus()); return iter; } else if (num_restarts_ == 0) { // Empty block. iter->Invalidate(Status::OK()); } else { - iter->Initialize(data_, restart_offset_, num_restarts_, + iter->Initialize(data(), restart_offset_, num_restarts_, block_contents_pinned, protection_bytes_per_key_, kv_checksum_, block_restart_interval_); } @@ -1265,8 +1624,8 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp, } else { ret_iter = new DataBlockIter; } - if (size_ < 2 * sizeof(uint32_t)) { - ret_iter->Invalidate(Status::Corruption("bad block contents")); + if (size() < 2 * sizeof(uint32_t)) { + ret_iter->Invalidate(GetCorruptionStatus()); return ret_iter; } if (num_restarts_ == 0) { @@ -1275,7 +1634,7 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp, return ret_iter; } else { ret_iter->Initialize( - raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno, + raw_ucmp, data(), restart_offset_, num_restarts_, global_seqno, read_amp_bitmap_.get(), block_contents_pinned, user_defined_timestamps_persisted, data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr, @@ -1296,15 +1655,16 @@ IndexBlockIter* Block::NewIndexIterator( IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek, bool have_first_key, bool key_includes_seq, bool value_is_full, bool block_contents_pinned, bool user_defined_timestamps_persisted, - BlockPrefixIndex* prefix_index) { + BlockPrefixIndex* prefix_index, + BlockBasedTableOptions::BlockSearchType index_block_search_type) { IndexBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; } else { ret_iter = new IndexBlockIter; } - if (size_ < 2 * sizeof(uint32_t)) { - ret_iter->Invalidate(Status::Corruption("bad block contents")); + if (size() < 2 * sizeof(uint32_t)) { + ret_iter->Invalidate(GetCorruptionStatus()); return ret_iter; } if (num_restarts_ == 0) { @@ -1314,11 +1674,12 @@ IndexBlockIter* Block::NewIndexIterator( } else { BlockPrefixIndex* prefix_index_ptr = total_order_seek ? nullptr : prefix_index; - ret_iter->Initialize( - raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno, - prefix_index_ptr, have_first_key, key_includes_seq, value_is_full, - block_contents_pinned, user_defined_timestamps_persisted, - protection_bytes_per_key_, kv_checksum_, block_restart_interval_); + ret_iter->Initialize(raw_ucmp, data(), restart_offset_, num_restarts_, + global_seqno, prefix_index_ptr, have_first_key, + key_includes_seq, value_is_full, block_contents_pinned, + user_defined_timestamps_persisted, + protection_bytes_per_key_, kv_checksum_, + block_restart_interval_, index_block_search_type); } return ret_iter; diff --git a/table/block_based/block.h b/table/block_based/block.h index 2cd2918a82d7..2187ff8c1e3b 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -163,11 +163,11 @@ class Block { ~Block(); - size_t size() const { return size_; } - const char* data() const { return data_; } + size_t size() const { return contents_.data.size(); } + const char* data() const { return contents_.data.data(); } // The additional memory space taken by the block data. size_t usable_size() const { return contents_.usable_size(); } - uint32_t NumRestarts() const; + uint32_t NumRestarts() const { return num_restarts_; } bool own_bytes() const { return contents_.own_bytes(); } BlockBasedTableOptions::DataBlockIndexType IndexType() const; @@ -233,13 +233,19 @@ class Block { // It is determined by IndexType property of the table. // `user_defined_timestamps_persisted` controls whether a min timestamp is // padded while key is being parsed from the block. + // `index_block_search_type` controls which search algorithm to use when + // reading the index block. kBinary uses binary search, while + // kInterpolation uses interpolation search which can be faster + // for uniformly distributed keys. IndexBlockIter* NewIndexIterator( const Comparator* raw_ucmp, SequenceNumber global_seqno, IndexBlockIter* iter, Statistics* stats, bool total_order_seek, bool have_first_key, bool key_includes_seq, bool value_is_full, bool block_contents_pinned = false, bool user_defined_timestamps_persisted = true, - BlockPrefixIndex* prefix_index = nullptr); + BlockPrefixIndex* prefix_index = nullptr, + BlockBasedTableOptions::BlockSearchType index_block_search_type = + BlockBasedTableOptions::kBinary); // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const; @@ -276,10 +282,15 @@ class Block { const char* TEST_GetKVChecksum() const { return kv_checksum_; } private: + // Returns a detailed error status by re-processing the footer. + // Should only be called when size() == 0 (error marker). + Status GetCorruptionStatus() const; + BlockContents contents_; - const char* data_; // contents_.data.data() - size_t size_; // contents_.data.size() - uint32_t restart_offset_; // Offset in data_ of restart array + // Normal state: offset in data_ of restart array. + // Error state (size()==0): original data size if footer decode failed, + // otherwise 0. Used by GetCorruptionStatus() to re-decode footer. + uint32_t restart_offset_; uint32_t num_restarts_; std::unique_ptr read_amp_bitmap_; char* kv_checksum_{nullptr}; @@ -428,7 +439,7 @@ class BlockIter : public InternalIteratorBase { Cache::Handle* cache_handle() { return cache_handle_; } protected: - std::unique_ptr icmp_; + InternalKeyComparator icmp_; const char* data_; // underlying block contents uint32_t num_restarts_; // Number of uint32_t entries in restart array @@ -530,17 +541,15 @@ class BlockIter : public InternalIteratorBase { uint32_t block_restart_interval) { assert(data_ == nullptr); // Ensure it is called only once assert(num_restarts > 0); // Ensure the param is valid - - icmp_ = std::make_unique(raw_ucmp); + assert(raw_ucmp != nullptr); + icmp_ = InternalKeyComparator(raw_ucmp); data_ = data; restarts_ = restarts; num_restarts_ = num_restarts; current_ = restarts_; restart_index_ = num_restarts_; global_seqno_ = global_seqno; - if (raw_ucmp != nullptr) { - ts_sz_ = raw_ucmp->timestamp_size(); - } + ts_sz_ = raw_ucmp->timestamp_size(); pad_min_timestamp_ = ts_sz_ > 0 && !user_defined_timestamp_persisted; block_contents_pinned_ = block_contents_pinned; cache_handle_ = nullptr; @@ -573,14 +582,18 @@ class BlockIter : public InternalIteratorBase { CorruptionError(error_msg); } - void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) { + void UpdateRawKeyAndMaybePadMinTimestamp(IterKey& raw_key, const Slice& key) { if (pad_min_timestamp_) { - raw_key_.SetKeyWithPaddedMinTimestamp(key, ts_sz_); + raw_key.SetKeyWithPaddedMinTimestamp(key, ts_sz_); } else { - raw_key_.SetKey(key, false /* copy */); + raw_key.SetKey(key, false /* copy */); } } + void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) { + UpdateRawKeyAndMaybePadMinTimestamp(raw_key_, key); + } + // Must be called every time a key is found that needs to be returned to user, // and may be called when no key is found (as a no-op). Updates `key_`, // `key_buf_`, and `key_pinned_` with info about the found key. @@ -620,18 +633,31 @@ class BlockIter : public InternalIteratorBase { } } - // Returns the result of `Comparator::Compare()`, where the appropriate - // comparator is used for the block contents, the LHS argument is the current - // key with global seqno applied, and the RHS argument is `other`. - int CompareCurrentKey(const Slice& other) { + // Compares two keys using the appropriate comparator for the block contents. + // Uses user comparator when the block stores user keys, otherwise uses the + // internal key comparator. When global_seqno is not disabled, applies it to + // the LHS key for comparison. + int CompareKey(const Slice& a, const Slice& b) const { + assert(icmp_.user_comparator() != nullptr); if (raw_key_.IsUserKey()) { assert(global_seqno_ == kDisableGlobalSequenceNumber); - return icmp_->user_comparator()->Compare(raw_key_.GetUserKey(), other); + return icmp_.user_comparator()->Compare(a, b); } else if (global_seqno_ == kDisableGlobalSequenceNumber) { - return icmp_->Compare(raw_key_.GetInternalKey(), other); + return icmp_.Compare(a, b); + } + return icmp_.Compare(a, global_seqno_, b, kDisableGlobalSequenceNumber); + } + + int CompareKey(const IterKey& a, const Slice& b) const { + if (a.IsUserKey()) { + return CompareKey(a.GetUserKey(), b); } - return icmp_->Compare(raw_key_.GetInternalKey(), global_seqno_, other, - kDisableGlobalSequenceNumber); + return CompareKey(a.GetInternalKey(), b); + } + + // Compares the current key (with global seqno applied) against `other`. + int CompareCurrentKey(const Slice& other) const { + return CompareKey(raw_key_, other); } private: @@ -666,8 +692,16 @@ class BlockIter : public InternalIteratorBase { protected: template - inline bool BinarySeek(const Slice& target, uint32_t* index, - bool* is_index_key_result); + inline bool GetRestartKey(uint32_t index, Slice* key); + + template + inline bool BinarySeekRestartPointIndex(const Slice& target, uint32_t* index, + bool* is_index_key_result); + + template + inline bool InterpolationSeekRestartPointIndex(const Slice& target, + uint32_t* index, + bool* is_index_key_result); // Find the first key in restart interval `index` that is >= `target`. // If there is no such key, iterator is positioned at the first key in @@ -831,14 +865,14 @@ class IndexBlockIter final : public BlockIter { // format. // value_is_full, default true, means that no delta encoding is // applied to values. - void Initialize(const Comparator* raw_ucmp, const char* data, - uint32_t restarts, uint32_t num_restarts, - SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, - bool have_first_key, bool key_includes_seq, - bool value_is_full, bool block_contents_pinned, - bool user_defined_timestamps_persisted, - uint8_t protection_bytes_per_key, const char* kv_checksum, - uint32_t block_restart_interval) { + void Initialize( + const Comparator* raw_ucmp, const char* data, uint32_t restarts, + uint32_t num_restarts, SequenceNumber global_seqno, + BlockPrefixIndex* prefix_index, bool have_first_key, + bool key_includes_seq, bool value_is_full, bool block_contents_pinned, + bool user_defined_timestamps_persisted, uint8_t protection_bytes_per_key, + const char* kv_checksum, uint32_t block_restart_interval, + BlockBasedTableOptions::BlockSearchType index_block_search_type) { InitializeBase(raw_ucmp, data, restarts, num_restarts, kDisableGlobalSequenceNumber, block_contents_pinned, user_defined_timestamps_persisted, protection_bytes_per_key, @@ -847,6 +881,7 @@ class IndexBlockIter final : public BlockIter { prefix_index_ = prefix_index; value_delta_encoded_ = !value_is_full; have_first_key_ = have_first_key; + index_search_type_ = index_block_search_type; if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) { global_seqno_state_.reset(new GlobalSeqnoState(global_seqno)); } else { @@ -941,6 +976,10 @@ class IndexBlockIter final : public BlockIter { // `pad_min_timestamp_` is true. std::string first_internal_key_with_ts_; + // The search algorithm to use when reading the index block. + BlockBasedTableOptions::BlockSearchType index_search_type_ = + BlockBasedTableOptions::kBinary; + // Set *prefix_may_exist to false if no key possibly share the same prefix // as `target`. If not set, the result position should be the same as total // order Seek. @@ -953,6 +992,10 @@ class IndexBlockIter final : public BlockIter { bool* prefix_may_exist); inline int CompareBlockKey(uint32_t block_index, const Slice& target); + template + bool FindRestartPointForSeek(const Slice& seek_key, uint32_t* index, + bool* skip_linear_scan); + inline bool ParseNextIndexKey(); // When value_delta_encoded_ is enabled it decodes the value which is assumed diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 3fb7b2dbdaf4..c080dcb5cca1 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -46,14 +46,17 @@ #include "table/block_based/filter_policy_internal.h" #include "table/block_based/full_filter_block.h" #include "table/block_based/partitioned_filter_block.h" +#include "table/block_based/user_defined_index_wrapper.h" #include "table/format.h" #include "table/meta_blocks.h" #include "table/table_builder.h" +#include "util/bit_fields.h" #include "util/coding.h" #include "util/compression.h" +#include "util/defer.h" +#include "util/semaphore.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/work_queue.h" namespace ROCKSDB_NAMESPACE { @@ -107,90 +110,20 @@ FilterBlockBuilder* CreateFilterBlockBuilder( } } -bool GoodCompressionRatio(size_t compressed_size, size_t uncomp_size, - int max_compressed_bytes_per_kb) { - // For efficiency, avoid floating point and division - return compressed_size <= - (static_cast(max_compressed_bytes_per_kb) * uncomp_size) >> - 10; -} - -} // namespace - -// format_version is the block format as defined in include/rocksdb/table.h -Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info, - CompressionType* type, uint32_t format_version, - bool allow_sample, std::string* compressed_output, - std::string* sampled_output_fast, - std::string* sampled_output_slow) { - assert(type); - assert(compressed_output); - assert(compressed_output->empty()); - - // If requested, we sample one in every N block with a - // fast and slow compression algorithm and report the stats. - // The users can use these stats to decide if it is worthwhile - // enabling compression and they also get a hint about which - // compression algorithm wil be beneficial. - if (allow_sample && info.SampleForCompression() && - Random::GetTLSInstance()->OneIn( - static_cast(info.SampleForCompression()))) { - // Sampling with a fast compression algorithm - if (sampled_output_fast && (LZ4_Supported() || Snappy_Supported())) { - CompressionType c = - LZ4_Supported() ? kLZ4Compression : kSnappyCompression; - CompressionOptions options; - CompressionContext context(c, options); - CompressionInfo info_tmp(options, context, - CompressionDict::GetEmptyDict(), c, - info.SampleForCompression()); - - CompressData(uncompressed_data, info_tmp, - GetCompressFormatForVersion(format_version), - sampled_output_fast); - } - - // Sampling with a slow but high-compression algorithm - if (sampled_output_slow && (ZSTD_Supported() || Zlib_Supported())) { - CompressionType c = ZSTD_Supported() ? kZSTD : kZlibCompression; - CompressionOptions options; - CompressionContext context(c, options); - CompressionInfo info_tmp(options, context, - CompressionDict::GetEmptyDict(), c, - info.SampleForCompression()); - - CompressData(uncompressed_data, info_tmp, - GetCompressFormatForVersion(format_version), - sampled_output_slow); - } - } - - int max_compressed_bytes_per_kb = info.options().max_compressed_bytes_per_kb; - if (info.type() == kNoCompression || max_compressed_bytes_per_kb <= 0) { - *type = kNoCompression; - return uncompressed_data; - } - - // Actually compress the data; if the compression method is not supported, - // or the compression fails etc., just fall back to uncompressed - if (!CompressData(uncompressed_data, info, - GetCompressFormatForVersion(format_version), - compressed_output)) { - *type = kNoCompression; - return uncompressed_data; - } - - // Check the compression ratio; if it's not good enough, just fall back to - // uncompressed - if (!GoodCompressionRatio(compressed_output->size(), uncompressed_data.size(), - max_compressed_bytes_per_kb)) { - *type = kNoCompression; - return uncompressed_data; +// A convenience function for populating the Compressor* fields; see ~Rep() +Compressor* MaybeCloneSpecialized( + Compressor* compressor, CacheEntryRole block_type, + Compressor::DictConfigArgs&& dict_config = Compressor::DictDisabled{}) { + auto specialized = + compressor->MaybeCloneSpecialized(block_type, std::move(dict_config)); + if (specialized) { + // Caller is responsible for freeing when distinct + return specialized.release(); + } else { + return compressor; } - - *type = info.type(); - return *compressed_output; } +} // namespace // kBlockBasedTableMagicNumber was picked by running // echo rocksdb.table.block_based | sha1sum @@ -201,9 +134,6 @@ Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info, // allocated // it must be not extern in one place. const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull; -// We also support reading and writing legacy block based table format (for -// backwards compatibility) -const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull; // A collector that collects properties of interest to block-based table. // For now this class looks heavy-weight since we only write one additional @@ -268,6 +198,587 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector bool decoupled_partitioned_filters_; }; +struct BlockBasedTableBuilder::WorkingAreaPair { + Compressor::ManagedWorkingArea compress; + Decompressor::ManagedWorkingArea verify; +}; + +// ParallelCompressionRep essentially defines a framework for parallelizing +// block generation ("emit"), block compression, and block writing to storage. +// The synchronization is lock-free/wait-free, so thread waiting only happens +// when work-order dependencies are unsatisfied, though sleeping/idle threads +// might be kept idle when it seems unlikely they would improve throughput by +// waking them up (essentially auto-tuned parallelism). But because all threads +// are capable of 2 out of 3 kinds of work, in a quasi-work-stealing system, +// running threads can usually expect that compatible work is available. +// +// This is currently activated with CompressionOptions::parallel_threads > 1 +// but that is a somewhat crude API that would ideally be adapted along with +// the implementation in the future to allow threads to serve multiple +// flush/compaction jobs, though the available improvement might be small. +// Even within the scope of a single file it might be nice to use a general +// framework for distributing work across threads, but (a) different threads +// are limited to which work they can do because of technical challenges, (b) +// being largely CPU bound on small work units means such a framework would +// likely have big overheads compared to this hand-optimized solution. +struct BlockBasedTableBuilder::ParallelCompressionRep { + // The framework has two kinds of threads: the calling thread from + // flush/compaction/SstFileWriter is called the "emit thread" (kEmitter). + // Other threads cannot generally take over "emit" work because that is + // largely happening up the call stack from BlockBasedTableBuilder. + // The emit thread can also take on compression work in a quasi-work-stealing + // manner when the buffer for emitting new blocks is full. + // + // When parallelism is enabled, there are also "worker" threads that + // can handle compressing blocks and (one worker thread at a time) write them + // to the SST file (and handle other single-threaded wrap-up of each block). + // + // NOTE: when parallelism is enabled, the emit thread is not permitted to + // write to the SST file because that is the potential "output" bottleneck, + // and it's generally bad for parallelism to allow the only thread that can + // serve the "input" bottleneck (emit work) to also spend exclusive time on + // the output bottleneck. + enum class ThreadKind { + kEmitter, + kWorker, + }; + + // ThreadState allows each thread to track its work assignment. In addition to + // the cases already mentioned, kEmitting, kCompressing, and kWriting to the + // SST file writer, + // * Threads can enter the kIdle state so that they can sleep when no work is + // available for them, to be woken up when appropriate. + // * The kEnd state means the thread is not doing any more work items, which + // for worker threads means they will end soon. + // * The kCompressingAndWriting state means a worker can compress and write a + // block without additional state updates because the same block to be + // compressed is the next to be written. + enum class ThreadState { + /* BEGIN Emitter only states */ + kEmitting, + /* END Emitter only states */ + /* BEGIN states for emitter and worker */ + kIdle, + kCompressing, + kEnd, + /* END states for emitter and worker */ + /* BEGIN Worker only states */ + kCompressingAndWriting, + kWriting, + /* END Worker only states */ + }; + + // BlockRep instances are used and reused in a ring buffer (below), so that + // many blocks can be in an intermediate state between serialized into + // uncompressed bytes and written to the SST file. Notably, each block is + // "emitted" in uncompressed form into a BlockRep, compressed (at least + // attempted, when configured) for updated BlockRep, and then written from the + // BlockRep to the writer for the SST file bytes. + struct ALIGN_AS(CACHE_LINE_SIZE) BlockRep { + // Uncompressed block contents + std::string uncompressed; + GrowableBuffer compressed; + CompressionType compression_type = kNoCompression; + std::unique_ptr prepared_index_entry; + }; + + // Ring buffer of emitted blocks that may or may not yet be compressed. + std::unique_ptr ring_buffer; + // log_2(ring buffer size), where ring buffer size must be a power of two + const int ring_buffer_nbits; + // ring buffer size - 1, to function as a bit mask for ring buffer positions + // (e.g. given the ordinal number of a block) + const uint32_t ring_buffer_mask; + // Number of threads in worker_threads. (Emit thread doesn't count) + const uint32_t num_worker_threads; + + // Rough upper bound on the sst file size contribution from blocks emitted + // into the parallel compression ring buffer but not yet written. Tracks + // uncompressed size, with trailer, until a block is compressed, then + // compressed size until the block is written. (TODO: does not currently + // account for block_align) + RelaxedAtomic estimated_inflight_size{0}; + // Thread objects for worker threads + std::vector worker_threads; + // Working areas for data_block_compressor for each worker thread + std::vector working_areas; + + // Semaphores for threads to sleep when there's no available work for them + // and to wake back up when someone determines there is available work (most + // likely). Split between worker threads and emit thread because they can do + // different kinds of work. + CountingSemaphore idle_worker_sem{0}; + BinarySemaphore idle_emit_sem{0}; + + // Primary atomic state of parallel compression, which includes a number of + // state fields that are best updated atomically to avoid locking and/or to + // simplify the interesting interleavings that have to be considered and + // accommodated. + struct State : public BitFields {}; + ALIGN_AS(CACHE_LINE_SIZE) BitFieldsAtomic atomic_state; + + // The first field is a bit for each ring buffer slot (max 32) for whether + // that slot is ready to be claimed for writing by a worker thread. Because + // compressions might finish out-of-order, we need to track individually + // whether they are finished, though this field doesn't differentiate + // "compression completed" from "compression not started" because that can be + // inferred from NextToCompress. A block might not enter this state, because + // the same thread that compresses it can also immediately write the block if + // it notices that the block is next to write. + using NeedsWriter = UnsignedBitField; + // Track how many worker threads are in an idle state because there was no + // available work and haven't been selected to wake back up. + using IdleWorkerCount = UnsignedBitField; + // Track whether the emit thread is an idle state because there was no + // available work and hasn't been triggered to wake back up. The nature of + // available work and atomic CAS assignment of work ensures at least one + // thread is kept out of the idle state. + using IdleEmitFlag = BoolBitField; + // Track whether threads should end when they finish available work because no + // more blocks will be emitted. + using NoMoreToEmitFlag = BoolBitField; + // Track whether threads should abort ASAP because of an error. + using AbortFlag = BoolBitField; + // Track three "NextTo" counters for the positions of the next block to write, + // to start compression, and to emit into the ring buffer. If these counters + // never overflowed / wrapped around, we would have next_to_write <= + // next_to_compress <= next_to_emit because a block must be emitted before + // compressed, and compressed (at least attempted) before writing. We need to + // track more than ring_buffer_nbits of these counters to be able to + // distinguish an empty ring buffer (next_to_write == next_to_emit) from a + // full ring buffer (next_to_write != next_to_emit but equal under + // ring_buffer_mask). + using NextToWrite = UnsignedBitField; + using NextToCompress = UnsignedBitField; + using NextToEmit = UnsignedBitField; + static_assert(NextToEmit::kEndBit == 64); + + // BEGIN fields for use by the emit thread only. These can't live on the stack + // because the emit thread frequently returns out of BlockBasedTableBuilder. + ALIGN_AS(CACHE_LINE_SIZE) + ThreadState emit_thread_state = ThreadState::kEmitting; + // Ring buffer index that emit thread is operating on (for emitting and + // compressing states) + uint32_t emit_slot = 0; + // Including some data to inform when to wake up idle worker threads (see + // implementation for details) + int32_t emit_counter_toward_wake_up = 0; + int32_t emit_counter_for_wake_up = 0; + static constexpr int32_t kMaxWakeupInterval = 8; + // END fields for use by the emit thread only + + // TSAN on GCC has bugs that report false positives on this watchdog code. + // Other efforts to work around the bug have failed, so to avoid those false + // positive reports, we simply disable the watchdog when running under GCC + // TSAN. +#if !defined(NDEBUG) && !(defined(__GNUC__) && defined(__SANITIZE_THREAD__)) +#define BBTB_PC_WATCHDOG 1 +#endif +#ifdef BBTB_PC_WATCHDOG + // These are for an extra "watchdog" thread in DEBUG builds that heuristically + // checks for the most likely deadlock conditions. False positives and false + // negatives are technically possible. + std::thread watchdog_thread; + std::mutex watchdog_mutex; + std::condition_variable watchdog_cv; + bool shutdown_watchdog = false; + RelaxedAtomic live_workers{0}; + RelaxedAtomic idling_workers{0}; + RelaxedAtomic live_emit{0}; + RelaxedAtomic idling_emit{0}; +#endif // BBTB_PC_WATCHDOG + + int ComputeRingBufferNbits(uint32_t parallel_threads) { + // Ring buffer size is a power of two not to exceed 32 but otherwise + // at least twice the number of threads. + if (parallel_threads >= 9) { + return 5; + } else if (parallel_threads >= 5) { + return 4; + } else if (parallel_threads >= 3) { + return 3; + } else { + assert(parallel_threads > 1); + return 2; + } + } + + explicit ParallelCompressionRep(uint32_t parallel_threads) + : ring_buffer_nbits(ComputeRingBufferNbits(parallel_threads)), + ring_buffer_mask((uint32_t{1} << ring_buffer_nbits) - 1), + num_worker_threads(std::min(parallel_threads, ring_buffer_mask)) { + assert(num_worker_threads <= IdleWorkerCount::kMask); + + ring_buffer = std::make_unique(ring_buffer_mask + 1); + + // Start by aggressively waking up idle workers + emit_counter_for_wake_up = -static_cast(num_worker_threads); + } + + ~ParallelCompressionRep() { +#ifndef NDEBUG + auto state = atomic_state.Load(); + if (state.Get() == false) { + // Should be clear / cancelled out with normal shutdown + assert(state.Get() == 0); + + // Ring buffer reached empty state + assert(state.Get() == state.Get()); + assert(state.Get() == state.Get()); + + // Everything cancels out in inflight size + assert(estimated_inflight_size.LoadRelaxed() == 0); + } + // All idling metadata cleaned up, properly tracked + assert(state.Get() == 0); + assert(state.Get() == false); + + // No excess in semaphores + assert(!idle_emit_sem.TryAcquire()); + assert(!idle_worker_sem.TryAcquire()); +#endif // !NDEBUG + } + + // The primary function for a thread transitioning from one state or work + // assignment to the next. `slot` refers to a position in the ring buffer + // for assigned emit, compression, or write work. + // + // Because both the emit thread and worker threads can work on compression, + // this is a quasi-work-stealing parallel algorithm. (Enabling other threads + // to do emit work would be quite challenging, and allowing the emit thread + // to handle writes could create a bottle-neck.) + // + // This function is basically a CAS loop trying to pick the next piece of work + // for this thread and retrying if CAS fails. This function also handles + // thread idling when that's the appropriate assignment, continuing the loop + // looking for productive work when woken from an idle state. + // + // Precondition: thread_state is appropriate for thread_kind and not kEnd. It + // must match the previously returned state for that thread, and is only kIdle + // for the thread on startup (though the kIdle state is used internal to the + // function). + // + // Postcondition: thread_state is appropriate for thread_kind and not kIdle. + // Except for kEnd state, the calling thread has exclusive access to + // ring_buffer[slot] until next StateTransition(). + template + void StateTransition( + /*in/out*/ ThreadState& thread_state, + /*in/out*/ uint32_t& slot) { + assert(slot <= ring_buffer_mask); + // Last known value for atomic_state + State seen_state = atomic_state.Load(); + + for (;;) { + if (seen_state.Get()) { + thread_state = ThreadState::kEnd; + return; + } + + assert(static_cast(seen_state.Get() - + seen_state.Get()) <= + ring_buffer_mask + 1); + assert(static_cast(seen_state.Get() - + seen_state.Get()) <= + ring_buffer_mask + 1); + assert(static_cast(seen_state.Get() - + seen_state.Get()) <= + ring_buffer_mask + 1); + + // Draft of the next proposed atomic_state. Start by marking completion of + // the current thread's last work. + State next_state = seen_state; + bool wake_idle = false; + switch (thread_state) { + case ThreadState::kEmitting: { + assert(thread_kind == ThreadKind::kEmitter); + assert(slot == (next_state.Get() & ring_buffer_mask)); + next_state.Ref() += 1; + // Check whether to wake up idle worker thread + if (next_state.Get() > 0 && + // The number of blocks for which compression hasn't started + // is well over the number of active threads. + static_cast(next_state.Get() - + next_state.Get()) >= + (ring_buffer_mask + 1) / 4 + + (num_worker_threads - + next_state.Get())) { + // At first, emit_counter_for_wake_up is negative to aggressively + // wake up idle worker threads. Then it backs off the interval at + // which we wake up, up to some maximum that attempts to balance + // maximum throughput and minimum CPU overhead. + if (emit_counter_toward_wake_up >= emit_counter_for_wake_up) { + // We reached a threshold to justify a wake-up. + wake_idle = true; + // Adjust idle count assuming we are going to own waking it up, + // so no one else can duplicate that. (The idle count is really + // the number idling for which no one yet owns waking them up.) + next_state.Ref() -= 1; + // Reset the counter toward the threshold for wake-up + emit_counter_toward_wake_up = 0; + // Raise the threshold (up to some limit) to stabilize the number + // of active threads after some ramp-up period. + emit_counter_for_wake_up = + std::min(emit_counter_for_wake_up + 1, + static_cast(num_worker_threads + + kMaxWakeupInterval)); + } else { + // Advance closer to the threshold for justifying a wake-up + emit_counter_toward_wake_up++; + } + } + break; + } + case ThreadState::kIdle: + // NOTE: thread that signalled to wake up already updated idle count + // or marker. This is required to avoid overflow on the semaphore, + // especially the binary semaphore for idle_emit_sem, and likely + // desirable to avoid spurious/extra Release(). + break; + case ThreadState::kCompressing: + next_state.Ref() |= uint32_t{1} << slot; + if constexpr (thread_kind == ThreadKind::kEmitter) { + if (next_state.Get() == num_worker_threads) { + // Work is available for a worker thread and none are running + wake_idle = true; + // Adjust idle count assuming we are going to own waking it up + next_state.Ref() -= 1; + } + } + break; + case ThreadState::kEnd: + // Should have already recognized the end state + assert(thread_state != ThreadState::kEnd); + return; + case ThreadState::kCompressingAndWriting: + case ThreadState::kWriting: + assert(thread_kind == ThreadKind::kWorker); + assert((next_state.Get() & ring_buffer_mask) == slot); + assert(next_state.Get() != + next_state.Get()); + assert(next_state.Get() != next_state.Get()); + assert((next_state.Get() & (uint32_t{1} << slot)) == 0); + next_state.Ref() += 1; + if (next_state.Get()) { + wake_idle = true; + // Clear idle emit flag assuming we are going to own waking it up + next_state.Set(false); + } + break; + } + + // Find the next state, depending on the kind of thread + ThreadState next_thread_state = ThreadState::kEnd; + uint32_t next_slot = 0; + if constexpr (thread_kind == ThreadKind::kEmitter) { + // First priority is emitting more uncompressed blocks, if there's + // room in the ring buffer. + if (static_cast(next_state.Get() - + next_state.Get()) <= + ring_buffer_mask) { + // There is room + next_thread_state = ThreadState::kEmitting; + next_slot = next_state.Get() & ring_buffer_mask; + } + } + if constexpr (thread_kind == ThreadKind::kWorker) { + // First priority is writing next block to write, if it needs a writer + // assigned to it + uint32_t next_to_write_slot = + next_state.Get() & ring_buffer_mask; + uint32_t needs_writer_bit = uint32_t{1} << next_to_write_slot; + if (next_state.Get() & needs_writer_bit) { + // Clear the "needs writer" marker on the slot + next_state.Ref() &= ~needs_writer_bit; + // Take ownership of writing it + next_thread_state = ThreadState::kWriting; + next_slot = next_to_write_slot; + } + } + + // If didn't find higher priority work + if (next_thread_state == ThreadState::kEnd) { + if (next_state.Get() != next_state.Get()) { + // Compression work is available, select that + if (thread_kind == ThreadKind::kWorker && + next_state.Get() == + next_state.Get()) { + next_thread_state = ThreadState::kCompressingAndWriting; + } else { + next_thread_state = ThreadState::kCompressing; + } + next_slot = next_state.Get() & ring_buffer_mask; + next_state.Ref() += 1; + } else if constexpr (thread_kind == ThreadKind::kEmitter) { + // Emitter thread goes idle + next_thread_state = ThreadState::kIdle; + assert(next_state.Get() == false); + assert(next_state.Get() == false); + next_state.Set(true); + } else if (next_state.Get()) { + // Worker thread shall not idle if we are done emitting. At least + // one worker will remain unblocked to finish writing + next_thread_state = ThreadState::kEnd; + } else { + // Worker thread goes idle + next_thread_state = ThreadState::kIdle; + assert(next_state.Get() < IdleWorkerCount::kMask); + next_state.Ref() += 1; + } + } + assert(thread_state != ThreadState::kEnd); + + // Attempt to atomically apply the desired/computed state transition + if (atomic_state.CasWeak(seen_state, next_state)) { + // Success + thread_state = next_thread_state; + slot = next_slot; + seen_state = next_state; + if (wake_idle) { + if constexpr (thread_kind == ThreadKind::kEmitter) { + idle_worker_sem.Release(); + } else { + idle_emit_sem.Release(); + } + } + if (thread_state != ThreadState::kIdle) { + // Successfully transitioned to another useful state + return; + } + // Handle idle state + if constexpr (thread_kind == ThreadKind::kEmitter) { +#ifdef BBTB_PC_WATCHDOG + idling_emit.StoreRelaxed(true); + Defer decr{[this]() { idling_emit.StoreRelaxed(false); }}; +#endif // BBTB_PC_WATCHDOG + + // Likely go to sleep + idle_emit_sem.Acquire(); + } else { +#ifdef BBTB_PC_WATCHDOG + // Tracking for watchdog + idling_workers.FetchAddRelaxed(1); + Defer decr{[this]() { idling_workers.FetchSubRelaxed(1); }}; +#endif // BBTB_PC_WATCHDOG + + // Likely go to sleep + idle_worker_sem.Acquire(); + } + // Update state after sleep + seen_state = atomic_state.Load(); + } + // else loop and try again + } + } + + void EmitterStateTransition( + /*in/out*/ ThreadState& thread_state, + /*in/out*/ uint32_t& slot) { + StateTransition(thread_state, slot); + } + + void WorkerStateTransition( + /*in/out*/ ThreadState& thread_state, + /*in/out*/ uint32_t& slot) { + StateTransition(thread_state, slot); + } + + // Exactly wake all idling threads (for an end state) + void WakeAllIdle() { + State old_state, new_state; + auto transform = + IdleEmitFlag::ClearTransform() + IdleWorkerCount::ClearTransform(); + atomic_state.Apply(transform, &old_state, &new_state); + assert(new_state.Get() == false); + assert(new_state.Get() == 0); + if (old_state.Get()) { + idle_emit_sem.Release(); + } + idle_worker_sem.Release(old_state.Get()); + } + + // Called by emit thread if it is decided no more blocks will be emitted into + // this SST file. + void SetNoMoreToEmit(/*in/out*/ ThreadState& thread_state, + /*in/out*/ uint32_t& slot) { + (void)slot; + State old_state; + atomic_state.Apply(NoMoreToEmitFlag::SetTransform(), &old_state); + assert(old_state.Get() == false); + assert(slot == BitwiseAnd(old_state.Get(), ring_buffer_mask)); + assert(thread_state == ThreadState::kEmitting); + thread_state = ThreadState::kEnd; + WakeAllIdle(); + } + + // Called by any thread to abort parallel compression, etc. because of an + // error. + void SetAbort(/*in/out*/ ThreadState& thread_state) { + State old_state; + atomic_state.Apply(AbortFlag::SetTransform(), &old_state); + if (old_state.Get() == false) { + // First to set abort. Wake all workers and emitter + WakeAllIdle(); + } + thread_state = ThreadState::kEnd; + } + +#ifdef BBTB_PC_WATCHDOG + // Logic for the extra "watchdog" thread in DEBUG builds that heuristically + // checks for the most likely deadlock conditions. + // + // Some ways to manually validate the watchdog: + // * Insert + // if (Random::GetTLSInstance()->OneIn(100)) { + // sleep(100); + // } + // after either of the calls to semaphore Acquire above. + // * Miss some Release()s in WakeAllIdle() + // + // and run table_test unit tests. + void BGWatchdog() { + int count_toward_deadlock_judgment = 0; + for (;;) { + // Check for termination condition: All workers and emit thread have + // completed. + if (live_workers.LoadRelaxed() == 0 && live_emit.LoadRelaxed() == false) { + return; + } + + // Check for potential deadlock condition + if (idling_workers.LoadRelaxed() < live_workers.LoadRelaxed() || + (live_emit.LoadRelaxed() && !idling_emit.LoadRelaxed())) { + // Someone is working, all good + count_toward_deadlock_judgment = 0; + } else { + // Could be a deadlock state, but could also be a transient + // state where someone has woken up but not cleared their idling flag. + // Give it plenty of time and watchdog thread wake-ups before + // declaring deadlock. + count_toward_deadlock_judgment++; + if (count_toward_deadlock_judgment >= 70) { + fprintf(stderr, + "Error: apparent deadlock in parallel compression. " + "Aborting. %u / %u, %d / %d, %llx\n", + (unsigned)idling_workers.LoadRelaxed(), + (unsigned)live_workers.LoadRelaxed(), + (int)idling_emit.LoadRelaxed(), (int)live_emit.LoadRelaxed(), + (long long)atomic_state.Load().underlying); + std::terminate(); + } + } + + // Sleep for 1s at a time unless we are woken up because other threads + // ended. + std::unique_lock lock(watchdog_mutex); + if (!shutdown_watchdog) { + watchdog_cv.wait_for(lock, std::chrono::seconds{1}); + } + } + } +#endif // BBTB_PC_WATCHDOG +}; + struct BlockBasedTableBuilder::Rep { const ImmutableOptions ioptions; // BEGIN from MutableCFOptions @@ -291,7 +802,9 @@ struct BlockBasedTableBuilder::Rep { // user key should contain the minimum timestamp. bool persist_user_defined_timestamps; WritableFileWriter* file; - std::atomic offset; + // The current offset is only written by the current designated writer thread + // but can be read by other threads to estimate current file size + RelaxedAtomic offset{0}; size_t alignment; BlockBuilder data_block; // Buffers uncompressed data blocks to replay later. Needed when @@ -306,19 +819,59 @@ struct BlockBasedTableBuilder::Rep { PartitionedIndexBuilder* p_index_builder_ = nullptr; std::string last_ikey; // Internal key or empty (unset) - const Slice* first_key_in_next_block = nullptr; - CompressionType compression_type; + bool warm_cache = false; + bool uses_explicit_compression_manager = false; + uint64_t sample_for_compression; - std::atomic compressible_input_data_bytes; - std::atomic uncompressible_input_data_bytes; - std::atomic sampled_input_data_bytes; - std::atomic sampled_output_slow_data_bytes; - std::atomic sampled_output_fast_data_bytes; - CompressionOptions compression_opts; - std::unique_ptr compression_dict; - std::vector> compression_ctxs; - std::vector> verify_ctxs; - std::unique_ptr verify_dict; + RelaxedAtomic compressible_input_data_bytes{0}; + RelaxedAtomic uncompressible_input_data_bytes{0}; + RelaxedAtomic sampled_input_data_bytes{0}; + RelaxedAtomic sampled_output_slow_data_bytes{0}; + RelaxedAtomic sampled_output_fast_data_bytes{0}; + uint32_t compression_parallel_threads; + int max_compressed_bytes_per_kb; + // Dictionary guidance for data blocks (from GetDictGuidance()) + Compressor::DictConfig data_block_dict_guidance; + + // *** Compressors & decompressors - Yes, it seems like a lot here but *** + // *** these are distinct fields to minimize extra conditionals and *** + // *** field reads on hot code paths. And to avoid interlocked *** + // *** instructions associated with shared_ptr. *** + + // A compressor for blocks in general, without dictionary compression + std::unique_ptr basic_compressor; + // Built-in compressors for compression size sampling + std::unique_ptr fast_sample_compressor; + std::unique_ptr slow_sample_compressor; + // A compressor for data blocks, which might be tuned differently and might + // use dictionary compression (when applicable). See ~Rep() for some details. + UnownedPtr data_block_compressor = nullptr; + // A compressor for index blocks, which might be tuned differently from + // basic_compressor. See ~Rep() for some details. + UnownedPtr index_block_compressor = nullptr; + // A decompressor corresponding to basic_compressor (when non-nullptr). + // Used for verification and cache warming. + std::shared_ptr basic_decompressor; + // When needed, a decompressor for verifying compression using a + // dictionary sampled/trained from this file. + std::unique_ptr verify_decompressor_with_dict; + // When non-nullptr, compression should be verified with this corresponding + // decompressor, except for data blocks. (Points to same as basic_decompressor + // when verify_compression is set.) + UnownedPtr verify_decompressor; + // Once configured/determined, points to one of the above Decompressors to use + // in verifying data blocks. + UnownedPtr data_block_verify_decompressor; + + // Set of compression types used for blocks in this file (mixing compression + // algorithms in a single file is allowed, using a CompressionManager) + SmallEnumSet + compression_types_used; + + // Working area for basic_compressor when compression_parallel_threads==1 + WorkingAreaPair index_block_working_area; + // Working area for data_block_compressor, for emit/compaction thread + WorkingAreaPair data_block_working_area; size_t data_begin_offset = 0; @@ -347,103 +900,119 @@ struct BlockBasedTableBuilder::Rep { kUnbuffered, kClosed, }; - State state; + State state = State::kUnbuffered; // `kBuffered` state is allowed only as long as the buffering of uncompressed // data blocks (see `data_block_buffers`) does not exceed `buffer_limit`. - uint64_t buffer_limit; + uint64_t buffer_limit = 0; std::shared_ptr compression_dict_buffer_cache_res_mgr; const bool use_delta_encoding_for_index_values; std::unique_ptr filter_builder; OffsetableCacheKey base_cache_key; const TableFileCreationReason reason; + const bool target_file_size_is_upper_bound; BlockHandle pending_handle; // Handle to add to index block - std::string compressed_output; + GrowableBuffer single_threaded_compressed_output; std::unique_ptr flush_block_policy; std::vector> table_properties_collectors; std::unique_ptr pc_rep; + RelaxedAtomic worker_cpu_micros{0}; BlockCreateContext create_context; // The size of the "tail" part of a SST file. "Tail" refers to // all blocks after data blocks till the end of the SST file. uint64_t tail_size; + // The total size of all blocks in this file before they are compressed. + // This is used for logging compaction stats. + uint64_t pre_compression_size = 0; + // See class Footer uint32_t base_context_checksum; - uint64_t get_offset() { return offset.load(std::memory_order_relaxed); } - void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); } + uint64_t get_offset() { return offset.LoadRelaxed(); } + void set_offset(uint64_t o) { offset.StoreRelaxed(o); } - bool IsParallelCompressionEnabled() const { - return compression_opts.parallel_threads > 1; - } + bool IsParallelCompressionActive() const { return pc_rep != nullptr; } - Status GetStatus() { - // We need to make modifications of status visible when status_ok is set - // to false, and this is ensured by status_mutex, so no special memory - // order for status_ok is required. - if (status_ok.load(std::memory_order_relaxed)) { - return Status::OK(); - } else { - return CopyStatus(); - } - } + Status GetStatus() { return GetIOStatus(); } - Status CopyStatus() { - std::lock_guard lock(status_mutex); - return status; + bool StatusOk() { + // The OK case is optimized with an atomic. Relaxed is sufficient because + // if a thread other than the emit/compaction thread sets to non-OK it + // will synchronize that in aborting parallel compression. + bool ok = io_status_ok.LoadRelaxed(); +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + if (ok) { + std::lock_guard lock(io_status_mutex); + // Double-check + if (io_status_ok.LoadRelaxed()) { + io_status.PermitUncheckedError(); + assert(io_status.ok()); + } else { + ok = false; + } + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + return ok; } IOStatus GetIOStatus() { - // We need to make modifications of io_status visible when status_ok is set - // to false, and this is ensured by io_status_mutex, so no special memory - // order for io_status_ok is required. - if (io_status_ok.load(std::memory_order_relaxed)) { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED // Avoid unnecessary lock acquisition - auto ios = CopyIOStatus(); - ios.PermitUncheckedError(); - // Assume no races in unit tests - assert(ios.ok()); + // See StatusOk, which is optimized to avoid Status object copies + if (LIKELY(io_status_ok.LoadRelaxed())) { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + std::lock_guard lock(io_status_mutex); + // Double-check + if (io_status_ok.LoadRelaxed()) { + io_status.PermitUncheckedError(); + assert(io_status.ok()); + } else { + return io_status; + } #endif // ROCKSDB_ASSERT_STATUS_CHECKED return IOStatus::OK(); } else { - return CopyIOStatus(); + std::lock_guard lock(io_status_mutex); + return io_status; } } - IOStatus CopyIOStatus() { - std::lock_guard lock(io_status_mutex); - return io_status; + // Avoid copying Status and IOStatus objects as much as possible. + // Never erase an existing I/O status that is not OK. + void SetStatus(Status&& s) { + if (UNLIKELY(!s.ok()) && io_status_ok.LoadRelaxed()) { + SetFailedIOStatus(status_to_io_status(std::move(s))); + } } - - // Never erase an existing status that is not OK. - void SetStatus(Status s) { - if (!s.ok() && status_ok.load(std::memory_order_relaxed)) { - // Locking is an overkill for non compression_opts.parallel_threads - // case but since it's unlikely that s is not OK, we take this cost - // to be simplicity. - std::lock_guard lock(status_mutex); - status = s; - status_ok.store(false, std::memory_order_relaxed); + void SetStatus(const Status& s) { + if (UNLIKELY(!s.ok()) && io_status_ok.LoadRelaxed()) { + SetFailedIOStatus(status_to_io_status(Status(s))); + } + } + void SetIOStatus(IOStatus&& ios) { + if (UNLIKELY(!ios.ok()) && io_status_ok.LoadRelaxed()) { + SetFailedIOStatus(std::move(ios)); + } + } + void SetIOStatus(const IOStatus& ios) { + if (UNLIKELY(!ios.ok()) && io_status_ok.LoadRelaxed()) { + SetFailedIOStatus(IOStatus(ios)); } } - // Never erase an existing I/O status that is not OK. - // Calling this will also SetStatus(ios) - void SetIOStatus(IOStatus ios) { - if (!ios.ok() && io_status_ok.load(std::memory_order_relaxed)) { - // Locking is an overkill for non compression_opts.parallel_threads - // case but since it's unlikely that s is not OK, we take this cost - // to be simplicity. - std::lock_guard lock(io_status_mutex); - io_status = ios; - io_status_ok.store(false, std::memory_order_relaxed); + void SetFailedIOStatus(IOStatus&& ios) { + assert(!ios.ok()); + // Because !s.ok() is rare, locking is acceptable even in non-parallel case. + std::lock_guard lock(io_status_mutex); + // Double-check + if (io_status.ok()) { + io_status = std::move(ios); + io_status_ok.StoreRelaxed(false); } - SetStatus(ios); } Rep(const BlockBasedTableOptions& table_opt, const TableBuilderOptions& tbo, @@ -457,7 +1026,6 @@ struct BlockBasedTableBuilder::Rep { persist_user_defined_timestamps( tbo.ioptions.persist_user_defined_timestamps), file(f), - offset(0), alignment(table_options.block_align ? std::min(static_cast(table_options.block_size), kDefaultPageSize) @@ -478,45 +1046,166 @@ struct BlockBasedTableBuilder::Rep { 0.75 /* data_block_hash_table_util_ratio */, ts_sz, persist_user_defined_timestamps), internal_prefix_transform(prefix_extractor.get()), - compression_type(tbo.compression_type), sample_for_compression(tbo.moptions.sample_for_compression), - compressible_input_data_bytes(0), - uncompressible_input_data_bytes(0), - sampled_input_data_bytes(0), - sampled_output_slow_data_bytes(0), - sampled_output_fast_data_bytes(0), - compression_opts(tbo.compression_opts), - compression_dict(), - compression_ctxs(tbo.compression_opts.parallel_threads), - verify_ctxs(tbo.compression_opts.parallel_threads), - verify_dict(), - state((tbo.compression_opts.max_dict_bytes > 0 && - tbo.compression_type != kNoCompression) - ? State::kBuffered - : State::kUnbuffered), + compression_parallel_threads( + ((table_opt.partition_filters && + !table_opt.decouple_partitioned_filters) || + table_options.user_defined_index_factory) + ? uint32_t{1} + : tbo.compression_opts.parallel_threads), + max_compressed_bytes_per_kb( + tbo.compression_opts.max_compressed_bytes_per_kb), use_delta_encoding_for_index_values(table_opt.format_version >= 4 && !table_opt.block_align), reason(tbo.reason), + target_file_size_is_upper_bound( + tbo.moptions.target_file_size_is_upper_bound), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( table_options, data_block)), create_context(&table_options, &ioptions, ioptions.stats, - compression_type == kZSTD, + /*decompressor=*/nullptr, tbo.moptions.block_protection_bytes_per_key, tbo.internal_comparator.user_comparator(), !use_delta_encoding_for_index_values, table_opt.index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey), - tail_size(0), - status_ok(true), - io_status_ok(true) { - if (tbo.target_file_size == 0) { - buffer_limit = compression_opts.max_dict_buffer_bytes; - } else if (compression_opts.max_dict_buffer_bytes == 0) { - buffer_limit = tbo.target_file_size; + tail_size(0) { + FilterBuildingContext filter_context(table_options); + + filter_context.info_log = ioptions.logger; + filter_context.column_family_name = tbo.column_family_name; + filter_context.reason = reason; + + // Only populate other fields if known to be in LSM rather than + // generating external SST file + if (reason != TableFileCreationReason::kMisc) { + filter_context.compaction_style = ioptions.compaction_style; + filter_context.num_levels = ioptions.num_levels; + filter_context.level_at_creation = tbo.level_at_creation; + filter_context.is_bottommost = tbo.is_bottommost; + assert(filter_context.level_at_creation < filter_context.num_levels); + } + + props.compression_options = + CompressionOptionsToString(tbo.compression_opts); + + auto* mgr = tbo.moptions.compression_manager.get(); + if (mgr == nullptr) { + uses_explicit_compression_manager = false; + mgr = GetBuiltinV2CompressionManager().get(); } else { - buffer_limit = std::min(tbo.target_file_size, - compression_opts.max_dict_buffer_bytes); + uses_explicit_compression_manager = true; + + // Stuff some extra debugging info as extra pseudo-options. Using + // underscore prefix to indicate they are special. + props.compression_options.append("_compression_manager="); + props.compression_options.append(mgr->GetId()); + props.compression_options.append("; "); + } + + // Sanitize to only allowing compression when it saves space. + max_compressed_bytes_per_kb = + std::min(int{1023}, tbo.compression_opts.max_compressed_bytes_per_kb); + + basic_compressor = mgr->GetCompressorForSST( + filter_context, tbo.compression_opts, tbo.compression_type); + if (basic_compressor) { + if (table_options.enable_index_compression) { + index_block_compressor = MaybeCloneSpecialized( + basic_compressor.get(), CacheEntryRole::kIndexBlock); + index_block_working_area.compress = + index_block_compressor->ObtainWorkingArea(); + } + data_block_dict_guidance = + basic_compressor->GetDictGuidance(CacheEntryRole::kDataBlock); + if (auto* sampling = + std::get_if(&data_block_dict_guidance); + sampling != nullptr && sampling->max_sample_bytes > 0) { + // Sampling mode: collect samples up to max_sample_bytes + state = State::kBuffered; + if (tbo.target_file_size == 0) { + buffer_limit = tbo.compression_opts.max_dict_buffer_bytes; + } else if (tbo.compression_opts.max_dict_buffer_bytes == 0) { + buffer_limit = tbo.target_file_size; + } else { + buffer_limit = std::min(tbo.target_file_size, + tbo.compression_opts.max_dict_buffer_bytes); + } + } else if (auto* predef = std::get_if( + &data_block_dict_guidance); + predef != nullptr && !predef->dict_data.empty()) { + // Pre-defined dictionary mode: use it immediately, no buffering + data_block_compressor = MaybeCloneSpecialized( + basic_compressor.get(), CacheEntryRole::kDataBlock, + Compressor::DictPreDefined{std::string{predef->dict_data}}); + data_block_working_area.compress = + data_block_compressor->ObtainWorkingArea(); + } else { + assert(std::holds_alternative( + data_block_dict_guidance) || + std::holds_alternative( + data_block_dict_guidance) || + std::holds_alternative( + data_block_dict_guidance)); + // No distinct data block compressor using dictionary, but + // implementation might still want to specialize for data blocks + data_block_compressor = MaybeCloneSpecialized( + basic_compressor.get(), CacheEntryRole::kDataBlock); + data_block_working_area.compress = + data_block_compressor->ObtainWorkingArea(); + } + basic_decompressor = basic_compressor->GetOptimizedDecompressor(); + if (basic_decompressor == nullptr) { + // Optimized version not available + basic_decompressor = mgr->GetDecompressor(); + } + create_context.decompressor = basic_decompressor.get(); + + if (table_options.verify_compression) { + verify_decompressor = basic_decompressor.get(); + if (table_options.enable_index_compression) { + index_block_working_area.verify = + verify_decompressor->ObtainWorkingArea( + index_block_compressor->GetPreferredCompressionType()); + } + if (state == State::kUnbuffered) { + assert(data_block_compressor); + data_block_verify_decompressor = verify_decompressor.get(); + data_block_working_area.verify = + data_block_verify_decompressor->ObtainWorkingArea( + data_block_compressor->GetPreferredCompressionType()); + } + } + } + + if (sample_for_compression > 0) { + auto builtin = GetBuiltinV2CompressionManager(); + if (builtin->SupportsCompressionType(kLZ4Compression)) { + fast_sample_compressor = builtin->GetCompressor({}, kLZ4Compression); + } else if (builtin->SupportsCompressionType(kSnappyCompression)) { + fast_sample_compressor = builtin->GetCompressor({}, kSnappyCompression); + } + if (builtin->SupportsCompressionType(kZSTD)) { + slow_sample_compressor = builtin->GetCompressor({}, kZSTD); + } else if (builtin->SupportsCompressionType(kZlibCompression)) { + slow_sample_compressor = builtin->GetCompressor({}, kZlibCompression); + } + // NOTE: even if both sampling compressors are nullptr, we still populate + // the table properties with placeholder info + } + + switch (table_options.prepopulate_block_cache) { + case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly: + warm_cache = (reason == TableFileCreationReason::kFlush); + break; + case BlockBasedTableOptions::PrepopulateBlockCache::kDisable: + warm_cache = false; + break; + default: + // missing case + assert(false); + warm_cache = false; } const auto compress_dict_build_buffer_charged = @@ -536,11 +1225,6 @@ struct BlockBasedTableBuilder::Rep { compression_dict_buffer_cache_res_mgr = nullptr; } - assert(compression_ctxs.size() >= compression_opts.parallel_threads); - for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { - compression_ctxs[i].reset( - new CompressionContext(compression_type, compression_opts)); - } if (table_options.index_type == BlockBasedTableOptions::kTwoLevelIndexSearch) { p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder( @@ -553,33 +1237,42 @@ struct BlockBasedTableBuilder::Rep { &this->internal_prefix_transform, use_delta_encoding_for_index_values, table_options, ts_sz, persist_user_defined_timestamps)); } + + // If user_defined_index_factory is provided, wrap the index builder with + // UserDefinedIndexWrapper + if (table_options.user_defined_index_factory != nullptr) { + if (tbo.moptions.compression_opts.parallel_threads > 1 || + tbo.moptions.bottommost_compression_opts.parallel_threads > 1) { + SetStatus( + Status::InvalidArgument("user_defined_index_factory not supported " + "with parallel compression")); + } else { + std::unique_ptr user_defined_index_builder; + UserDefinedIndexOption udi_options; + udi_options.comparator = internal_comparator.user_comparator(); + auto s = table_options.user_defined_index_factory->NewBuilder( + udi_options, user_defined_index_builder); + if (!s.ok()) { + SetStatus(s); + } else { + if (user_defined_index_builder != nullptr) { + index_builder = std::make_unique( + std::string(table_options.user_defined_index_factory->Name()), + std::move(index_builder), std::move(user_defined_index_builder), + &internal_comparator, ts_sz, persist_user_defined_timestamps); + } + } + } + } + if (ioptions.optimize_filters_for_hits && tbo.is_bottommost) { // Apply optimize_filters_for_hits setting here when applicable by // skipping filter generation filter_builder.reset(); - } else if (tbo.skip_filters) { - // For SstFileWriter skip_filters - filter_builder.reset(); } else if (!table_options.filter_policy) { // Null filter_policy -> no filter filter_builder.reset(); } else { - FilterBuildingContext filter_context(table_options); - - filter_context.info_log = ioptions.logger; - filter_context.column_family_name = tbo.column_family_name; - filter_context.reason = reason; - - // Only populate other fields if known to be in LSM rather than - // generating external SST file - if (reason != TableFileCreationReason::kMisc) { - filter_context.compaction_style = ioptions.compaction_style; - filter_context.num_levels = ioptions.num_levels; - filter_context.level_at_creation = tbo.level_at_creation; - filter_context.is_bottommost = tbo.is_bottommost; - assert(filter_context.level_at_creation < filter_context.num_levels); - } - filter_builder.reset(CreateFilterBlockBuilder( ioptions, tbo.moptions, filter_context, use_delta_encoding_for_index_values, p_index_builder_, ts_sz, @@ -600,20 +1293,15 @@ struct BlockBasedTableBuilder::Rep { } } table_properties_collectors.emplace_back( - new BlockBasedTablePropertiesCollector( + std::make_unique( table_options.index_type, table_options.whole_key_filtering, prefix_extractor != nullptr, table_options.decouple_partitioned_filters)); if (ts_sz > 0 && persist_user_defined_timestamps) { table_properties_collectors.emplace_back( - new TimestampTablePropertiesCollector( + std::make_unique( tbo.internal_comparator.user_comparator())); } - if (table_options.verify_compression) { - for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { - verify_ctxs[i].reset(new UncompressionContext(compression_type)); - } - } // These are only needed for populating table properties props.column_family_id = tbo.column_family_id; @@ -632,6 +1320,9 @@ struct BlockBasedTableBuilder::Rep { // Default is UINT64_MAX for unknown. Setting it to 0 here // to allow updating it by taking max in BlockBasedTableBuilder::Add(). props.key_largest_seqno = 0; + // Default is UINT64_MAX for unknown. + props.key_smallest_seqno = UINT64_MAX; + PrePopulateCompressionProperties(mgr); if (FormatVersionUsesContextChecksum(table_options.format_version)) { // Must be non-zero and semi- or quasi-random @@ -644,7 +1335,7 @@ struct BlockBasedTableBuilder::Rep { base_context_checksum = 0; } - if (alignment > 0 && compression_type != kNoCompression) { + if (alignment > 0 && basic_compressor) { // With better sanitization in `CompactionPicker::CompactFiles()`, we // would not need to handle this case here and could change it to an // assertion instead. @@ -653,347 +1344,129 @@ struct BlockBasedTableBuilder::Rep { } } - Rep(const Rep&) = delete; - Rep& operator=(const Rep&) = delete; - - private: - // Synchronize status & io_status accesses across threads from main thread, - // compression thread and write thread in parallel compression. - std::mutex status_mutex; - std::atomic status_ok; - Status status; - std::mutex io_status_mutex; - std::atomic io_status_ok; - IOStatus io_status; -}; - -struct BlockBasedTableBuilder::ParallelCompressionRep { - // TODO: consider replacing with autovector or similar - // Keys is a wrapper of vector of strings avoiding - // releasing string memories during vector clear() - // in order to save memory allocation overhead - class Keys { - public: - Keys() : keys_(kKeysInitSize), size_(0) {} - void PushBack(const Slice& key) { - if (size_ == keys_.size()) { - keys_.emplace_back(key.data(), key.size()); - } else { - keys_[size_].assign(key.data(), key.size()); - } - size_++; - } - void SwapAssign(std::vector& keys) { - size_ = keys.size(); - std::swap(keys_, keys); - } - void Clear() { size_ = 0; } - size_t Size() { return size_; } - std::string& Back() { return keys_[size_ - 1]; } - std::string& operator[](size_t idx) { - assert(idx < size_); - return keys_[idx]; - } - - private: - const size_t kKeysInitSize = 32; - std::vector keys_; - size_t size_; - }; - std::unique_ptr curr_block_keys; - - class BlockRepSlot; - - // BlockRep instances are fetched from and recycled to - // block_rep_pool during parallel compression. - struct BlockRep { - Slice contents; - Slice compressed_contents; - std::unique_ptr data; - std::unique_ptr compressed_data; - CompressionType compression_type; - std::unique_ptr first_key_in_next_block; - std::unique_ptr keys; - std::unique_ptr slot; - Status status; - }; - // Use a vector of BlockRep as a buffer for a determined number - // of BlockRep structures. All data referenced by pointers in - // BlockRep will be freed when this vector is destructed. - using BlockRepBuffer = std::vector; - BlockRepBuffer block_rep_buf; - // Use a thread-safe queue for concurrent access from block - // building thread and writer thread. - using BlockRepPool = WorkQueue; - BlockRepPool block_rep_pool; - - // Use BlockRepSlot to keep block order in write thread. - // slot_ will pass references to BlockRep - class BlockRepSlot { - public: - BlockRepSlot() : slot_(1) {} - template - void Fill(T&& rep) { - slot_.push(std::forward(rep)); - } - void Take(BlockRep*& rep) { slot_.pop(rep); } - - private: - // slot_ will pass references to BlockRep in block_rep_buf, - // and those references are always valid before the destruction of - // block_rep_buf. - WorkQueue slot_; - }; - - // Compression queue will pass references to BlockRep in block_rep_buf, - // and those references are always valid before the destruction of - // block_rep_buf. - using CompressQueue = WorkQueue; - CompressQueue compress_queue; - std::vector compress_thread_pool; - - // Write queue will pass references to BlockRep::slot in block_rep_buf, - // and those references are always valid before the corresponding - // BlockRep::slot is destructed, which is before the destruction of - // block_rep_buf. - using WriteQueue = WorkQueue; - WriteQueue write_queue; - std::unique_ptr write_thread; - - // Estimate output file size when parallel compression is enabled. This is - // necessary because compression & flush are no longer synchronized, - // and BlockBasedTableBuilder::FileSize() is no longer accurate. - // memory_order_relaxed suffices because accurate statistics is not required. - class FileSizeEstimator { - public: - explicit FileSizeEstimator() - : uncomp_bytes_compressed(0), - uncomp_bytes_curr_block(0), - uncomp_bytes_curr_block_set(false), - uncomp_bytes_inflight(0), - blocks_inflight(0), - curr_compression_ratio(0), - estimated_file_size(0) {} - - // Estimate file size when a block is about to be emitted to - // compression thread - void EmitBlock(uint64_t uncomp_block_size, uint64_t curr_file_size) { - uint64_t new_uncomp_bytes_inflight = - uncomp_bytes_inflight.fetch_add(uncomp_block_size, - std::memory_order_relaxed) + - uncomp_block_size; - - uint64_t new_blocks_inflight = - blocks_inflight.fetch_add(1, std::memory_order_relaxed) + 1; - - estimated_file_size.store( - curr_file_size + - static_cast( - static_cast(new_uncomp_bytes_inflight) * - curr_compression_ratio.load(std::memory_order_relaxed)) + - new_blocks_inflight * kBlockTrailerSize, - std::memory_order_relaxed); - } - - // Estimate file size when a block is already reaped from - // compression thread - void ReapBlock(uint64_t compressed_block_size, uint64_t curr_file_size) { - assert(uncomp_bytes_curr_block_set); - - uint64_t new_uncomp_bytes_compressed = - uncomp_bytes_compressed + uncomp_bytes_curr_block; - assert(new_uncomp_bytes_compressed > 0); - - curr_compression_ratio.store( - (curr_compression_ratio.load(std::memory_order_relaxed) * - uncomp_bytes_compressed + - compressed_block_size) / - static_cast(new_uncomp_bytes_compressed), - std::memory_order_relaxed); - uncomp_bytes_compressed = new_uncomp_bytes_compressed; - - uint64_t new_uncomp_bytes_inflight = - uncomp_bytes_inflight.fetch_sub(uncomp_bytes_curr_block, - std::memory_order_relaxed) - - uncomp_bytes_curr_block; - - uint64_t new_blocks_inflight = - blocks_inflight.fetch_sub(1, std::memory_order_relaxed) - 1; - - estimated_file_size.store( - curr_file_size + - static_cast( - static_cast(new_uncomp_bytes_inflight) * - curr_compression_ratio.load(std::memory_order_relaxed)) + - new_blocks_inflight * kBlockTrailerSize, - std::memory_order_relaxed); - - uncomp_bytes_curr_block_set = false; - } - - void SetEstimatedFileSize(uint64_t size) { - estimated_file_size.store(size, std::memory_order_relaxed); - } - - uint64_t GetEstimatedFileSize() { - return estimated_file_size.load(std::memory_order_relaxed); - } - - void SetCurrBlockUncompSize(uint64_t size) { - uncomp_bytes_curr_block = size; - uncomp_bytes_curr_block_set = true; - } - - private: - // Input bytes compressed so far. - uint64_t uncomp_bytes_compressed; - // Size of current block being appended. - uint64_t uncomp_bytes_curr_block; - // Whether uncomp_bytes_curr_block has been set for next - // ReapBlock call. - bool uncomp_bytes_curr_block_set; - // Input bytes under compression and not appended yet. - std::atomic uncomp_bytes_inflight; - // Number of blocks under compression and not appended yet. - std::atomic blocks_inflight; - // Current compression ratio, maintained by BGWorkWriteMaybeCompressedBlock. - std::atomic curr_compression_ratio; - // Estimated SST file size. - std::atomic estimated_file_size; - }; - FileSizeEstimator file_size_estimator; - - // Facilities used for waiting first block completion. Need to Wait for - // the completion of first block compression and flush to get a non-zero - // compression ratio. - std::atomic first_block_processed; - std::condition_variable first_block_cond; - std::mutex first_block_mutex; - - explicit ParallelCompressionRep(uint32_t parallel_threads) - : curr_block_keys(new Keys()), - block_rep_buf(parallel_threads), - block_rep_pool(parallel_threads), - compress_queue(parallel_threads), - write_queue(parallel_threads), - first_block_processed(false) { - for (uint32_t i = 0; i < parallel_threads; i++) { - block_rep_buf[i].contents = Slice(); - block_rep_buf[i].compressed_contents = Slice(); - block_rep_buf[i].data.reset(new std::string()); - block_rep_buf[i].compressed_data.reset(new std::string()); - block_rep_buf[i].compression_type = CompressionType(); - block_rep_buf[i].first_key_in_next_block.reset(new std::string()); - block_rep_buf[i].keys.reset(new Keys()); - block_rep_buf[i].slot.reset(new BlockRepSlot()); - block_rep_buf[i].status = Status::OK(); - block_rep_pool.push(&block_rep_buf[i]); - } - } - - ~ParallelCompressionRep() { block_rep_pool.finish(); } - - // Make a block prepared to be emitted to compression thread - // Used in non-buffered mode - BlockRep* PrepareBlock(CompressionType compression_type, - const Slice* first_key_in_next_block, - BlockBuilder* data_block) { - BlockRep* block_rep = - PrepareBlockInternal(compression_type, first_key_in_next_block); - assert(block_rep != nullptr); - data_block->SwapAndReset(*(block_rep->data)); - block_rep->contents = *(block_rep->data); - std::swap(block_rep->keys, curr_block_keys); - curr_block_keys->Clear(); - return block_rep; - } - - // Used in EnterUnbuffered - BlockRep* PrepareBlock(CompressionType compression_type, - const Slice* first_key_in_next_block, - std::string* data_block, - std::vector* keys) { - BlockRep* block_rep = - PrepareBlockInternal(compression_type, first_key_in_next_block); - assert(block_rep != nullptr); - std::swap(*(block_rep->data), *data_block); - block_rep->contents = *(block_rep->data); - block_rep->keys->SwapAssign(*keys); - return block_rep; - } - - // Emit a block to compression thread - void EmitBlock(BlockRep* block_rep) { - assert(block_rep != nullptr); - assert(block_rep->status.ok()); - if (!write_queue.push(block_rep->slot.get())) { - return; - } - if (!compress_queue.push(block_rep)) { - return; + ~Rep() { + // Delete working areas before their compressors. + index_block_working_area = {}; + data_block_working_area = {}; + // Must have been cleaned up by StopParallelCompression + assert(pc_rep == nullptr); + // Delete specialized compressors if they were distinct (avoiding extra + // fields and interlocked instructions with shared_ptr) + if (data_block_compressor.get() != basic_compressor.get()) { + delete data_block_compressor.get(); } - - if (!first_block_processed.load(std::memory_order_relaxed)) { - std::unique_lock lock(first_block_mutex); - first_block_cond.wait(lock, [this] { - return first_block_processed.load(std::memory_order_relaxed); - }); + if (index_block_compressor.get() != basic_compressor.get()) { + delete index_block_compressor.get(); } } - // Reap a block from compression thread - void ReapBlock(BlockRep* block_rep) { - assert(block_rep != nullptr); - block_rep->compressed_data->clear(); - block_rep_pool.push(block_rep); + Rep(const Rep&) = delete; + Rep& operator=(const Rep&) = delete; - if (!first_block_processed.load(std::memory_order_relaxed)) { - std::lock_guard lock(first_block_mutex); - first_block_processed.store(true, std::memory_order_relaxed); - first_block_cond.notify_one(); + void PrePopulateCompressionProperties(UnownedPtr mgr) { + if (FormatVersionUsesCompressionManagerName(table_options.format_version)) { + assert(mgr); + // Use newer compression_name property + props.compression_name.reserve(32); + // If compression is disabled, use empty manager name + if (basic_compressor) { + props.compression_name.append(mgr->CompatibilityName()); + } + props.compression_name.push_back(';'); + // Rest of property to be filled out at the end of building the file + } else { + // Use legacy compression_name property, populated at the end of + // building the file. Not compatible with compression managers using + // custom algorithms / compression types. + assert( + Slice(mgr->CompatibilityName()) + .compare(GetBuiltinV2CompressionManager()->CompatibilityName()) == + 0); + } + } + void PostPopulateCompressionProperties() { + // Do not include "no compression" in the set. It's not really useful + // information whether there are any uncompressed blocks. Some kinds of + // blocks are never compressed anyway. + compression_types_used.Remove(kNoCompression); + size_t ctype_count = compression_types_used.count(); + + if (uses_explicit_compression_manager) { + // Stuff some extra debugging info as extra pseudo-options. Using + // underscore prefix to indicate they are special. + std::string& compression_options = props.compression_options; + compression_options.append("_compressor="); + compression_options.append(data_block_compressor + ? data_block_compressor->GetId() + : std::string{}); + compression_options.append("; "); + } else { + // No explicit compression manager + assert(compression_types_used.count() <= 1); + } + + std::string& compression_name = props.compression_name; + if (FormatVersionUsesCompressionManagerName(table_options.format_version)) { + // Fill in extended field of "compression name" property, which is the + // set of compression types used, sorted by unsigned byte and then hex + // encoded with two digits each (so that table properties are human + // readable). + assert(*compression_name.rbegin() == ';'); + size_t pos = compression_name.size(); + // Make space for the field contents + compression_name.append(ctype_count * 2, '\0'); + char* ptr = compression_name.data() + pos; + // Populate the field contents + for (CompressionType t : compression_types_used) { + PutBaseChars<16>(&ptr, /*n=*/2, static_cast(t), + /*uppercase=*/true); + } + assert(ptr == compression_name.data() + pos + ctype_count * 2); + // Allow additional fields in the future + compression_name.push_back(';'); + } else { + // Use legacy compression naming. To adhere to requirements described in + // TableProperties::compression_name, we might have to replace the name + // based on the legacy configured compression type. + assert(compression_name.empty()); + if (ctype_count == 0) { + // We could get a slight performance boost in the reader by marking + // the file as "no compression" if compression is configured but + // consistently rejected, but that would give misleading info for + // debugging purposes. So instead we record the configured compression + // type, matching the historical behavior. + if (data_block_compressor) { + compression_name = CompressionTypeToString( + data_block_compressor->GetPreferredCompressionType()); + } else { + assert(basic_compressor == nullptr); + compression_name = CompressionTypeToString(kNoCompression); + } + } else if (compression_types_used.Contains(kZSTD)) { + compression_name = CompressionTypeToString(kZSTD); + } else { + compression_name = + CompressionTypeToString(*compression_types_used.begin()); + } } } private: - BlockRep* PrepareBlockInternal(CompressionType compression_type, - const Slice* first_key_in_next_block) { - BlockRep* block_rep = nullptr; - block_rep_pool.pop(block_rep); - assert(block_rep != nullptr); - - assert(block_rep->data); - - block_rep->compression_type = compression_type; - - if (first_key_in_next_block == nullptr) { - block_rep->first_key_in_next_block.reset(nullptr); - } else { - block_rep->first_key_in_next_block->assign( - first_key_in_next_block->data(), first_key_in_next_block->size()); - } - - return block_rep; - } + // Synchronize io_status to be readable/writable across threads, but + // optimize for the OK case + std::mutex io_status_mutex; + RelaxedAtomic io_status_ok{true}; + IOStatus io_status; }; BlockBasedTableBuilder::BlockBasedTableBuilder( const BlockBasedTableOptions& table_options, const TableBuilderOptions& tbo, WritableFileWriter* file) { BlockBasedTableOptions sanitized_table_options(table_options); - if (sanitized_table_options.format_version == 0 && - sanitized_table_options.checksum != kCRC32c) { - ROCKS_LOG_WARN( - tbo.ioptions.logger, - "Silently converting format_version to 1 because checksum is " - "non-default"); - // silently convert format_version to 1 to keep consistent with current - // behavior - sanitized_table_options.format_version = 1; - } auto ucmp = tbo.internal_comparator.user_comparator(); assert(ucmp); (void)ucmp; // avoids unused variable error. - rep_ = new Rep(sanitized_table_options, tbo, file); + rep_ = std::make_unique(sanitized_table_options, tbo, file); TEST_SYNC_POINT_CALLBACK( "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey", @@ -1002,92 +1475,58 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( BlockBasedTable::SetupBaseCacheKey(&rep_->props, tbo.db_session_id, tbo.cur_file_num, &rep_->base_cache_key); - if (rep_->IsParallelCompressionEnabled()) { - StartParallelCompression(); + MaybeStartParallelCompression(); + if (!rep_->IsParallelCompressionActive() && rep_->basic_compressor) { + rep_->single_threaded_compressed_output.ResetForSize( + table_options.block_size); } } BlockBasedTableBuilder::~BlockBasedTableBuilder() { // Catch errors where caller forgot to call Finish() assert(rep_->state == Rep::State::kClosed); - delete rep_; } void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) { - Rep* r = rep_; + Rep* r = rep_.get(); assert(rep_->state != Rep::State::kClosed); - if (!ok()) { + if (UNLIKELY(!ok())) { return; } ValueType value_type; SequenceNumber seq; UnPackSequenceAndType(ExtractInternalKeyFooter(ikey), &seq, &value_type); r->props.key_largest_seqno = std::max(r->props.key_largest_seqno, seq); + r->props.key_smallest_seqno = std::min(r->props.key_smallest_seqno, seq); if (IsValueType(value_type)) { #ifndef NDEBUG if (r->props.num_entries > r->props.num_range_deletions) { assert(r->internal_comparator.Compare(ikey, Slice(r->last_ikey)) > 0); } + bool skip = false; + TEST_SYNC_POINT_CALLBACK("BlockBasedTableBuilder::Add::skip", (void*)&skip); + if (skip) { + return; + } #endif // !NDEBUG auto should_flush = r->flush_block_policy->Update(ikey, value); if (should_flush) { assert(!r->data_block.empty()); - r->first_key_in_next_block = &ikey; - Flush(); - if (r->state == Rep::State::kBuffered) { - bool exceeds_buffer_limit = - (r->buffer_limit != 0 && r->data_begin_offset > r->buffer_limit); - bool exceeds_global_block_cache_limit = false; - - // Increase cache charging for the last buffered data block - // only if the block is not going to be unbuffered immediately - // and there exists a cache reservation manager - if (!exceeds_buffer_limit && - r->compression_dict_buffer_cache_res_mgr != nullptr) { - Status s = - r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation( - r->data_begin_offset); - exceeds_global_block_cache_limit = s.IsMemoryLimit(); - } - - if (exceeds_buffer_limit || exceeds_global_block_cache_limit) { - EnterUnbuffered(); - } - } - - // Add item to index block. - // We do not emit the index entry for a block until we have seen the - // first key for the next data block. This allows us to use shorter - // keys in the index block. For example, consider a block boundary - // between the keys "the quick brown fox" and "the who". We can use - // "the r" as the key for the index block entry since it is >= all - // entries in the first block and < all entries in subsequent - // blocks. - if (ok() && r->state == Rep::State::kUnbuffered) { - if (r->IsParallelCompressionEnabled()) { - r->pc_rep->curr_block_keys->Clear(); - } else { - r->index_builder->AddIndexEntry(r->last_ikey, &ikey, - r->pending_handle, - &r->index_separator_scratch); - } - } + Flush(/*first_key_in_next_block=*/&ikey); } - // Note: PartitionedFilterBlockBuilder requires key being added to filter - // builder after being added to index builder. + // Note: PartitionedFilterBlockBuilder with + // decouple_partitioned_filters=false requires key being added to filter + // builder after being added to and "finished" in the index builder, so + // forces no parallel compression (logic in Rep constructor). if (r->state == Rep::State::kUnbuffered) { - if (r->IsParallelCompressionEnabled()) { - r->pc_rep->curr_block_keys->PushBack(ikey); - } else { - if (r->filter_builder != nullptr) { - r->filter_builder->AddWithPrevKey( - ExtractUserKeyAndStripTimestamp(ikey, r->ts_sz), - r->last_ikey.empty() - ? Slice{} - : ExtractUserKeyAndStripTimestamp(r->last_ikey, r->ts_sz)); - } + if (r->filter_builder != nullptr) { + r->filter_builder->AddWithPrevKey( + ExtractUserKeyAndStripTimestamp(ikey, r->ts_sz), + r->last_ikey.empty() + ? Slice{} + : ExtractUserKeyAndStripTimestamp(r->last_ikey, r->ts_sz)); } } @@ -1098,9 +1537,7 @@ void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) { // Buffered keys will be replayed from data_block_buffers during // `Finish()` once compression dictionary has been finalized. } else { - if (!r->IsParallelCompressionEnabled()) { - r->index_builder->OnKeyAdded(ikey); - } + r->index_builder->OnKeyAdded(ikey, value); } // TODO offset passed in is not accurate for parallel compression case NotifyCollectTableCollectorsOnAdd(ikey, value, r->get_offset(), @@ -1147,214 +1584,405 @@ void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) { } } -void BlockBasedTableBuilder::Flush() { - Rep* r = rep_; +void BlockBasedTableBuilder::Flush(const Slice* first_key_in_next_block) { + Rep* r = rep_.get(); assert(rep_->state != Rep::State::kClosed); - if (!ok()) { + if (UNLIKELY(!ok())) { return; } if (r->data_block.empty()) { return; } - if (r->IsParallelCompressionEnabled() && - r->state == Rep::State::kUnbuffered) { - r->data_block.Finish(); - ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock( - r->compression_type, r->first_key_in_next_block, &(r->data_block)); - assert(block_rep != nullptr); - r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(), - r->get_offset()); - r->pc_rep->EmitBlock(block_rep); + Slice uncompressed_block_data = r->data_block.Finish(); + + // NOTE: compression sampling is done here in the same thread as building + // the uncompressed block because of the requirements to call table + // property collectors: + // * BlockAdd function expects block_compressed_bytes_{fast,slow} for + // historical reasons. Probably a hassle to remove. + // * Collector is not thread safe so calls need to be + // serialized/synchronized. + // * Ideally, AddUserKey and BlockAdd calls need to line up such that a + // reported block corresponds to all the keys reported since the previous + // block. + + // If requested, we sample one in every N block with a + // fast and slow compression algorithm and report the stats. + // The users can use these stats to decide if it is worthwhile + // enabling compression and they also get a hint about which + // compression algorithm wil be beneficial. + if (r->sample_for_compression > 0 && + Random::GetTLSInstance()->OneIn( + static_cast(r->sample_for_compression))) { + GrowableBuffer sampled_output; + sampled_output.ResetForSize(uncompressed_block_data.size()); + size_t fast_size = uncompressed_block_data.size(); + size_t slow_size = uncompressed_block_data.size(); + + // Sampling with a fast compression algorithm + if (r->fast_sample_compressor) { + CompressionType result_type = kNoCompression; + Status s = r->fast_sample_compressor->CompressBlock( + uncompressed_block_data, sampled_output.data(), &fast_size, + &result_type, /*working_area=*/nullptr); + if (!s.ok() || result_type == kNoCompression) { + // For accounting, fall back on no compression + fast_size = uncompressed_block_data.size(); + } + } + + // Sampling with a slow but high-compression algorithm + if (r->slow_sample_compressor) { + CompressionType result_type = kNoCompression; + Status s = r->slow_sample_compressor->CompressBlock( + uncompressed_block_data, sampled_output.data(), &slow_size, + &result_type, /*working_area=*/nullptr); + if (!s.ok() || result_type == kNoCompression) { + // For accounting, fall back on no compression + slow_size = uncompressed_block_data.size(); + } + } + + // NOTE: Currently compression sampling is only enabled for data block. + r->sampled_input_data_bytes.FetchAddRelaxed(uncompressed_block_data.size()); + r->sampled_output_slow_data_bytes.FetchAddRelaxed(slow_size); + r->sampled_output_fast_data_bytes.FetchAddRelaxed(fast_size); + + NotifyCollectTableCollectorsOnBlockAdd(r->table_properties_collectors, + uncompressed_block_data.size(), + slow_size, fast_size); } else { - WriteBlock(&r->data_block, &r->pending_handle, BlockType::kData); + NotifyCollectTableCollectorsOnBlockAdd( + r->table_properties_collectors, uncompressed_block_data.size(), + 0 /*block_compressed_bytes_slow*/, 0 /*block_compressed_bytes_fast*/); } -} -void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, - BlockHandle* handle, - BlockType block_type) { - block->Finish(); - std::string uncompressed_block_data; - uncompressed_block_data.reserve(rep_->table_options.block_size); - block->SwapAndReset(uncompressed_block_data); if (rep_->state == Rep::State::kBuffered) { - assert(block_type == BlockType::kData); - rep_->data_block_buffers.emplace_back(std::move(uncompressed_block_data)); - rep_->data_begin_offset += rep_->data_block_buffers.back().size(); - return; + std::string uncompressed_block_holder; + uncompressed_block_holder.reserve(rep_->table_options.block_size); + r->data_block.SwapAndReset(uncompressed_block_holder); + assert(uncompressed_block_data.size() == uncompressed_block_holder.size()); + rep_->data_block_buffers.emplace_back(std::move(uncompressed_block_holder)); + rep_->data_begin_offset += uncompressed_block_data.size(); + MaybeEnterUnbuffered(first_key_in_next_block); + } else { + // Increment num_data_blocks when a data block is finalized in the + // emit thread to avoid data races with write worker threads + ++r->props.num_data_blocks; + + // Notify filter builder that a data block has been finalized + // This must happen on the emit thread before the block is added to the + // ring buffer to avoid race conditions with worker threads + if (r->filter_builder) { + r->filter_builder->OnDataBlockFinalized(r->props.num_data_blocks); + } + + if (r->IsParallelCompressionActive()) { + EmitBlockForParallel(r->data_block.MutableBuffer(), r->last_ikey, + first_key_in_next_block); + } else { + EmitBlock(r->data_block.MutableBuffer(), r->last_ikey, + first_key_in_next_block); + } + r->data_block.Reset(); + } +} + +void BlockBasedTableBuilder::EmitBlockForParallel( + std::string& uncompressed, const Slice& last_key_in_current_block, + const Slice* first_key_in_next_block) { + Rep* r = rep_.get(); + assert(r->state == Rep::State::kUnbuffered); + assert(uncompressed.size() > 0); + auto& pc_rep = *r->pc_rep; + // Can emit the uncompressed block into the ring buffer + assert(pc_rep.emit_thread_state == + ParallelCompressionRep::ThreadState::kEmitting); + auto* block_rep = &pc_rep.ring_buffer[pc_rep.emit_slot]; + pc_rep.estimated_inflight_size.FetchAddRelaxed(uncompressed.size() + + kBlockTrailerSize); + std::swap(uncompressed, block_rep->uncompressed); + r->index_builder->PrepareIndexEntry(last_key_in_current_block, + first_key_in_next_block, + block_rep->prepared_index_entry.get()); + block_rep->compressed.Reset(); + block_rep->compression_type = kNoCompression; + + // Might need to take up some compression work before we are able to + // resume emitting the next uncompressed block. + for (;;) { + pc_rep.EmitterStateTransition(pc_rep.emit_thread_state, pc_rep.emit_slot); + + if (pc_rep.emit_thread_state == + ParallelCompressionRep::ThreadState::kCompressing) { + // Took up some compression work to help unblock ourself + block_rep = &pc_rep.ring_buffer[pc_rep.emit_slot]; + Status s = CompressAndVerifyBlock( + block_rep->uncompressed, /*is_data_block=*/true, + r->data_block_working_area, &block_rep->compressed, + &block_rep->compression_type); + if (UNLIKELY(!s.ok())) { + r->SetStatus(s); + pc_rep.SetAbort(pc_rep.emit_thread_state); + break; + } + } else { + assert(pc_rep.emit_thread_state != + ParallelCompressionRep::ThreadState::kCompressingAndWriting); + assert(pc_rep.emit_thread_state != + ParallelCompressionRep::ThreadState::kWriting); + assert(pc_rep.emit_thread_state != + ParallelCompressionRep::ThreadState::kIdle); + // Either emitting or end state. + // Detect nothing more to emit and set if so. + if (first_key_in_next_block == nullptr && + pc_rep.emit_thread_state == + ParallelCompressionRep::ThreadState::kEmitting) { + pc_rep.SetNoMoreToEmit(pc_rep.emit_thread_state, pc_rep.emit_slot); + } + break; + } + } +} +void BlockBasedTableBuilder::EmitBlock(std::string& uncompressed, + const Slice& last_key_in_current_block, + const Slice* first_key_in_next_block) { + Rep* r = rep_.get(); + assert(r->state == Rep::State::kUnbuffered); + // Single-threaded context only + assert(!r->IsParallelCompressionActive()); + assert(uncompressed.size() > 0); + // When data blocks are aligned with super block alignment, delta encoding + // needs to be skipped for the first block after padding. + bool skip_delta_encoding = false; + WriteBlock(uncompressed, &r->pending_handle, BlockType::kData, + &skip_delta_encoding); + if (LIKELY(ok())) { + // We do not emit the index entry for a block until we have seen the + // first key for the next data block. This allows us to use shorter + // keys in the index block. For example, consider a block boundary + // between the keys "the quick brown fox" and "the who". We can use + // "the r" as the key for the index block entry since it is >= all + // entries in the first block and < all entries in subsequent + // blocks. + r->index_builder->AddIndexEntry( + last_key_in_current_block, first_key_in_next_block, r->pending_handle, + &r->index_separator_scratch, skip_delta_encoding); } - WriteBlock(uncompressed_block_data, handle, block_type); } void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data, BlockHandle* handle, - BlockType block_type) { - Rep* r = rep_; + BlockType block_type, + bool* skip_delta_encoding) { + Rep* r = rep_.get(); assert(r->state == Rep::State::kUnbuffered); - Slice block_contents; + // Single-threaded context only + assert(!r->IsParallelCompressionActive()); CompressionType type; - Status compress_status; bool is_data_block = block_type == BlockType::kData; - CompressAndVerifyBlock(uncompressed_block_data, is_data_block, - *(r->compression_ctxs[0]), r->verify_ctxs[0].get(), - &(r->compressed_output), &(block_contents), &type, - &compress_status); + // NOTE: only index and data blocks are currently compressed + assert(is_data_block || block_type == BlockType::kIndex); + Status compress_status = CompressAndVerifyBlock( + uncompressed_block_data, is_data_block, + is_data_block ? r->data_block_working_area : r->index_block_working_area, + &r->single_threaded_compressed_output, &type); r->SetStatus(compress_status); - if (!ok()) { + if (UNLIKELY(!ok())) { return; } TEST_SYNC_POINT_CALLBACK( "BlockBasedTableBuilder::WriteBlock:TamperWithCompressedData", - &r->compressed_output); - WriteMaybeCompressedBlock(block_contents, type, handle, block_type, - &uncompressed_block_data); - r->compressed_output.clear(); + &r->single_threaded_compressed_output); + WriteMaybeCompressedBlock( + type == kNoCompression ? uncompressed_block_data + : Slice(r->single_threaded_compressed_output), + type, handle, block_type, &uncompressed_block_data, skip_delta_encoding); + r->single_threaded_compressed_output.Reset(); if (is_data_block) { r->props.data_size = r->get_offset(); - ++r->props.num_data_blocks; + r->props.uncompressed_data_size += uncompressed_block_data.size(); } } -void BlockBasedTableBuilder::BGWorkCompression( - const CompressionContext& compression_ctx, - UncompressionContext* verify_ctx) { - ParallelCompressionRep::BlockRep* block_rep = nullptr; - while (rep_->pc_rep->compress_queue.pop(block_rep)) { - assert(block_rep != nullptr); - CompressAndVerifyBlock(block_rep->contents, true, /* is_data_block*/ - compression_ctx, verify_ctx, - block_rep->compressed_data.get(), - &block_rep->compressed_contents, - &(block_rep->compression_type), &block_rep->status); - block_rep->slot->Fill(block_rep); - } +uint64_t BlockBasedTableBuilder::GetWorkerCPUMicros() const { + return rep_->worker_cpu_micros.LoadRelaxed(); } -void BlockBasedTableBuilder::CompressAndVerifyBlock( - const Slice& uncompressed_block_data, bool is_data_block, - const CompressionContext& compression_ctx, UncompressionContext* verify_ctx, - std::string* compressed_output, Slice* block_contents, - CompressionType* type, Status* out_status) { - Rep* r = rep_; - bool is_status_ok = ok(); - if (!r->IsParallelCompressionEnabled()) { - assert(is_status_ok); - } +void BlockBasedTableBuilder::BGWorker(WorkingAreaPair& working_area) { + // Record CPU usage of this thread + const uint64_t start_cpu_micros = + rep_->ioptions.env->GetSystemClock()->CPUMicros(); + Defer log_cpu{[this, start_cpu_micros]() { + rep_->worker_cpu_micros.FetchAddRelaxed( + rep_->ioptions.env->GetSystemClock()->CPUMicros() - start_cpu_micros); + }}; + + auto& pc_rep = *rep_->pc_rep; +#ifdef BBTB_PC_WATCHDOG + pc_rep.live_workers.FetchAddRelaxed(1); + Defer decr{[&pc_rep]() { pc_rep.live_workers.FetchSubRelaxed(1); }}; +#endif // BBTB_PC_WATCHDOG + ParallelCompressionRep::ThreadState thread_state = + ParallelCompressionRep::ThreadState::kIdle; + uint32_t slot = 0; + // Workers should avoid checking the shared status (e.g. ok()) to minimize + // potential data dependencies across threads. If another thread hits an + // error, we will pick up the kEnd state from the abort. + IOStatus ios; + do { + pc_rep.WorkerStateTransition(thread_state, slot); + ParallelCompressionRep::BlockRep* block_rep = &pc_rep.ring_buffer[slot]; + auto compress_fn = [this, block_rep, &ios, &working_area]() { + ios = status_to_io_status(CompressAndVerifyBlock( + block_rep->uncompressed, /*is_data_block=*/true, working_area, + &block_rep->compressed, &block_rep->compression_type)); + }; + auto write_fn = [this, block_rep, &ios]() { + Slice compressed = block_rep->compressed; + Slice uncompressed = block_rep->uncompressed; + bool skip_delta_encoding = false; + ios = WriteMaybeCompressedBlockImpl( + block_rep->compression_type == kNoCompression ? uncompressed + : compressed, + block_rep->compression_type, &rep_->pending_handle, BlockType::kData, + &uncompressed, &skip_delta_encoding); + if (LIKELY(ios.ok())) { + rep_->props.data_size = rep_->get_offset(); + rep_->props.uncompressed_data_size += block_rep->uncompressed.size(); + + rep_->index_builder->FinishIndexEntry( + rep_->pending_handle, block_rep->prepared_index_entry.get(), + skip_delta_encoding); + } + }; + switch (thread_state) { + case ParallelCompressionRep::ThreadState::kEnd: + // All done + assert(ios.ok()); + return; + case ParallelCompressionRep::ThreadState::kCompressing: + compress_fn(); + break; + case ParallelCompressionRep::ThreadState::kCompressingAndWriting: + compress_fn(); + if (LIKELY(ios.ok())) { + write_fn(); + } + break; + case ParallelCompressionRep::ThreadState::kWriting: + write_fn(); + break; + case ParallelCompressionRep::ThreadState::kEmitting: + // Shouldn't happen + assert(thread_state != ParallelCompressionRep::ThreadState::kEmitting); + break; + case ParallelCompressionRep::ThreadState::kIdle: + // Shouldn't happen + assert(thread_state != ParallelCompressionRep::ThreadState::kIdle); + break; + default: + assert(false); + break; + } + } while (LIKELY(ios.ok())); + // Hit an error, so abort + rep_->SetIOStatus(ios); + pc_rep.SetAbort(thread_state); +} - if (is_status_ok && uncompressed_block_data.size() < kCompressionSizeLimit) { - StopWatchNano timer( - r->ioptions.clock, - ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)); +Status BlockBasedTableBuilder::CompressAndVerifyBlock( + const Slice& uncompressed_block_data, bool is_data_block, + WorkingAreaPair& working_area, GrowableBuffer* compressed_output, + CompressionType* result_compression_type) { + Rep* r = rep_.get(); + Status status; - *type = r->compression_type; -#ifndef NDEBUG - if (r->compression_type != kNoCompression && - g_hack_mixed_compression_in_block_based_table.LoadRelaxed() > 0U) { - // If zstd is in the mix, the compression_name table property needs to be - // set to it, for proper handling of context and dictionaries. - assert(!ZSTD_Supported() || r->compression_type == kZSTD); - const auto& compressions = GetSupportedCompressions(); - auto counter = - g_hack_mixed_compression_in_block_based_table.FetchAddRelaxed(1); - *type = compressions[counter % compressions.size()]; - } -#endif // !NDEBUG + UnownedPtr compressor = nullptr; + Decompressor* verify_decomp = nullptr; + if (is_data_block) { + compressor = r->data_block_compressor; + verify_decomp = r->data_block_verify_decompressor.get(); + } else { + compressor = r->index_block_compressor; + verify_decomp = r->verify_decompressor.get(); + } + + compressed_output->Reset(); + CompressionType type = kNoCompression; + if (LIKELY(uncompressed_block_data.size() < kCompressionSizeLimit)) { + if (compressor) { + StopWatchNano timer( + r->ioptions.clock, + ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)); + + size_t max_compressed_size = static_cast( + (static_cast(r->max_compressed_bytes_per_kb) * + uncompressed_block_data.size()) >> + 10); + compressed_output->ResetForSize(max_compressed_size); + status = compressor->CompressBlock( + uncompressed_block_data, compressed_output->data(), + &compressed_output->MutableSize(), &type, &working_area.compress); + + // Post-condition of Compressor::CompressBlock + assert(type == kNoCompression || status.ok()); + assert(type == kNoCompression || + r->table_options.verify_compression == (verify_decomp != nullptr)); - if (is_data_block) { - r->compressible_input_data_bytes.fetch_add(uncompressed_block_data.size(), - std::memory_order_relaxed); - } - const CompressionDict* compression_dict; - if (!is_data_block || r->compression_dict == nullptr) { - compression_dict = &CompressionDict::GetEmptyDict(); - } else { - compression_dict = r->compression_dict.get(); - } - assert(compression_dict != nullptr); - CompressionInfo compression_info(r->compression_opts, compression_ctx, - *compression_dict, *type, - r->sample_for_compression); - - std::string sampled_output_fast; - std::string sampled_output_slow; - *block_contents = CompressBlock( - uncompressed_block_data, compression_info, type, - r->table_options.format_version, is_data_block /* allow_sample */, - compressed_output, &sampled_output_fast, &sampled_output_slow); - - if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) { - // Currently compression sampling is only enabled for data block. - assert(is_data_block); - r->sampled_input_data_bytes.fetch_add(uncompressed_block_data.size(), - std::memory_order_relaxed); - r->sampled_output_slow_data_bytes.fetch_add(sampled_output_slow.size(), - std::memory_order_relaxed); - r->sampled_output_fast_data_bytes.fetch_add(sampled_output_fast.size(), - std::memory_order_relaxed); - } - // notify collectors on block add - NotifyCollectTableCollectorsOnBlockAdd( - r->table_properties_collectors, uncompressed_block_data.size(), - sampled_output_fast.size(), sampled_output_slow.size()); - - // Some of the compression algorithms are known to be unreliable. If - // the verify_compression flag is set then try to de-compress the - // compressed data and compare to the input. - if (*type != kNoCompression && r->table_options.verify_compression) { - // Retrieve the uncompressed contents into a new buffer - const UncompressionDict* verify_dict; - if (!is_data_block || r->verify_dict == nullptr) { - verify_dict = &UncompressionDict::GetEmptyDict(); - } else { - verify_dict = r->verify_dict.get(); - } - assert(verify_dict != nullptr); - BlockContents contents; - UncompressionInfo uncompression_info(*verify_ctx, *verify_dict, - r->compression_type); - Status uncompress_status = UncompressBlockData( - uncompression_info, block_contents->data(), block_contents->size(), - &contents, r->table_options.format_version, r->ioptions); - - if (uncompress_status.ok()) { - bool data_match = contents.data.compare(uncompressed_block_data) == 0; - if (!data_match) { - // The result of the compression was invalid. abort. - const char* const msg = - "Decompressed block did not match pre-compression block"; - ROCKS_LOG_ERROR(r->ioptions.logger, "%s", msg); - *out_status = Status::Corruption(msg); - *type = kNoCompression; + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::CompressAndVerifyBlock:TamperWithResultType", + &type); + + // Some of the compression algorithms are known to be unreliable. If + // the verify_compression flag is set then try to de-compress the + // compressed data and compare to the input. + if (verify_decomp && type != kNoCompression) { + BlockContents contents; + Status uncompress_status = DecompressBlockData( + compressed_output->data(), compressed_output->size(), type, + *verify_decomp, &contents, r->ioptions, + /*allocator=*/nullptr, &working_area.verify); + + if (LIKELY(uncompress_status.ok())) { + bool data_match = contents.data.compare(uncompressed_block_data) == 0; + if (!data_match) { + // The result of the compression was invalid. abort. + const char* const msg = + "Decompressed block did not match pre-compression block"; + ROCKS_LOG_ERROR(r->ioptions.logger, "%s", msg); + status = Status::Corruption(msg); + type = kNoCompression; + } + } else { + // Decompression reported an error. abort. + status = Status::Corruption(std::string("Could not decompress: ") + + uncompress_status.getState()); + type = kNoCompression; } - } else { - // Decompression reported an error. abort. - *out_status = Status::Corruption(std::string("Could not decompress: ") + - uncompress_status.getState()); - *type = kNoCompression; + } + if (timer.IsStarted()) { + RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS, + timer.ElapsedNanos()); } } - if (timer.IsStarted()) { - RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS, - timer.ElapsedNanos()); + if (is_data_block) { + r->compressible_input_data_bytes.FetchAddRelaxed( + uncompressed_block_data.size()); + r->uncompressible_input_data_bytes.FetchAddRelaxed(kBlockTrailerSize); } } else { // Status is not OK, or block is too big to be compressed. if (is_data_block) { - r->uncompressible_input_data_bytes.fetch_add( - uncompressed_block_data.size(), std::memory_order_relaxed); + r->uncompressible_input_data_bytes.FetchAddRelaxed( + uncompressed_block_data.size() + kBlockTrailerSize); } - *type = kNoCompression; - } - if (is_data_block) { - r->uncompressible_input_data_bytes.fetch_add(kBlockTrailerSize, - std::memory_order_relaxed); } // Abort compression if the block is too big, or did not pass // verification. - if (*type == kNoCompression) { - *block_contents = uncompressed_block_data; + if (type == kNoCompression) { bool compression_attempted = !compressed_output->empty(); RecordTick(r->ioptions.stats, compression_attempted ? NUMBER_BLOCK_COMPRESSION_REJECTED @@ -1369,45 +1997,112 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock( uncompressed_block_data.size()); RecordTick(r->ioptions.stats, BYTES_COMPRESSED_TO, compressed_output->size()); + if (r->IsParallelCompressionActive() && is_data_block) { + r->pc_rep->estimated_inflight_size.FetchSubRelaxed( + uncompressed_block_data.size() - compressed_output->size()); + } } + *result_compression_type = type; + return status; } void BlockBasedTableBuilder::WriteMaybeCompressedBlock( const Slice& block_contents, CompressionType comp_type, BlockHandle* handle, - BlockType block_type, const Slice* uncompressed_block_data) { + BlockType block_type, const Slice* uncompressed_block_data, + bool* skip_delta_encoding) { + // Must have pre-checked status in single-threaded context + assert(status().ok()); + assert(io_status().ok()); + rep_->SetIOStatus(WriteMaybeCompressedBlockImpl( + block_contents, comp_type, handle, block_type, uncompressed_block_data, + skip_delta_encoding)); +} + +IOStatus BlockBasedTableBuilder::WriteMaybeCompressedBlockImpl( + const Slice& block_contents, CompressionType comp_type, BlockHandle* handle, + BlockType block_type, const Slice* uncompressed_block_data, + bool* skip_delta_encoding) { // File format contains a sequence of blocks where each block has: // block_data: uint8[n] // compression_type: uint8 // checksum: uint32 - Rep* r = rep_; + Rep* r = rep_.get(); bool is_data_block = block_type == BlockType::kData; + // For data block, skip_delta_encoding must be non null + if (is_data_block) { + assert(skip_delta_encoding != nullptr); + } + if (skip_delta_encoding != nullptr) { + *skip_delta_encoding = false; + } IOOptions io_options; + // Always return io_s for NRVO IOStatus io_s = WritableFileWriter::PrepareIOOptions(r->write_options, io_options); - if (!io_s.ok()) { - r->SetIOStatus(io_s); - return; + if (UNLIKELY(!io_s.ok())) { + return io_s; } // Old, misleading name of this function: WriteRawBlock StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS); - const uint64_t offset = r->get_offset(); + + auto offset = r->get_offset(); + // try to align the data block page to the super alignment size, if enabled + if ((r->table_options.super_block_alignment_size != 0) && is_data_block) { + auto super_block_alignment_mask = + r->table_options.super_block_alignment_size - 1; + if ((r->table_options.super_block_alignment_space_overhead_ratio != 0) && + (offset & (~super_block_alignment_mask)) != + ((offset + block_contents.size()) & + (~super_block_alignment_mask))) { + auto allowed_max_padding_size = + r->table_options.super_block_alignment_size / + r->table_options.super_block_alignment_space_overhead_ratio; + // new block would cross the super block boundary + auto pad_bytes = r->table_options.super_block_alignment_size - + (offset & super_block_alignment_mask); + if (pad_bytes < allowed_max_padding_size) { + io_s = r->file->Pad(io_options, pad_bytes, allowed_max_padding_size); + if (UNLIKELY(!io_s.ok())) { + r->SetIOStatus(io_s); + return io_s; + } + r->pre_compression_size += pad_bytes; + offset += pad_bytes; + r->set_offset(offset); + if (skip_delta_encoding != nullptr) { + // Skip delta encoding in index block builder when a super block + // alignment padding is added for data block. + *skip_delta_encoding = true; + } + TEST_SYNC_POINT( + "BlockBasedTableBuilder::WriteMaybeCompressedBlock:" + "SuperBlockAlignment"); + } else { + TEST_SYNC_POINT( + "BlockBasedTableBuilder::WriteMaybeCompressedBlock:" + "SuperBlockAlignmentPaddingBytesExceedLimit"); + } + } + } + handle->set_offset(offset); handle->set_size(block_contents.size()); - assert(status().ok()); - assert(io_status().ok()); if (uncompressed_block_data == nullptr) { uncompressed_block_data = &block_contents; assert(comp_type == kNoCompression); } + // TODO: consider a variant of this function that puts the trailer after + // block_contents (if it comes from a std::string) so we only need one + // r->file->Append call { io_s = r->file->Append(io_options, block_contents); - if (!io_s.ok()) { - r->SetIOStatus(io_s); - return; + if (UNLIKELY(!io_s.ok())) { + return io_s; } } + r->compression_types_used.Add(comp_type); std::array trailer; trailer[0] = comp_type; uint32_t checksum = ComputeBuiltinChecksumWithLastByte( @@ -1416,10 +2111,10 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( checksum += ChecksumModifierForContext(r->base_context_checksum, offset); if (block_type == BlockType::kFilter) { - Status s = r->filter_builder->MaybePostVerifyFilter(block_contents); - if (!s.ok()) { - r->SetStatus(s); - return; + io_s = status_to_io_status( + r->filter_builder->MaybePostVerifyFilter(block_contents)); + if (UNLIKELY(!io_s.ok())) { + return io_s; } } @@ -1429,36 +2124,21 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( trailer.data()); { io_s = r->file->Append(io_options, Slice(trailer.data(), trailer.size())); - if (!io_s.ok()) { - r->SetIOStatus(io_s); - return; + if UNLIKELY (!io_s.ok()) { + return io_s; } } - { - bool warm_cache; - switch (r->table_options.prepopulate_block_cache) { - case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly: - warm_cache = (r->reason == TableFileCreationReason::kFlush); - break; - case BlockBasedTableOptions::PrepopulateBlockCache::kDisable: - warm_cache = false; - break; - default: - // missing case - assert(false); - warm_cache = false; - } - if (warm_cache) { - Status s = InsertBlockInCacheHelper(*uncompressed_block_data, handle, - block_type); - if (!s.ok()) { - r->SetStatus(s); - return; - } + if (r->warm_cache) { + io_s = status_to_io_status( + InsertBlockInCacheHelper(*uncompressed_block_data, handle, block_type)); + if (UNLIKELY(!io_s.ok())) { + return io_s; } } + r->pre_compression_size += + uncompressed_block_data->size() + kBlockTrailerSize; r->set_offset(r->get_offset() + block_contents.size() + kBlockTrailerSize); if (r->table_options.block_align && is_data_block) { size_t pad_bytes = @@ -1466,109 +2146,93 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( ((block_contents.size() + kBlockTrailerSize) & (r->alignment - 1))) & (r->alignment - 1); - io_s = r->file->Pad(io_options, pad_bytes); - if (io_s.ok()) { + io_s = r->file->Pad(io_options, pad_bytes, kDefaultPageSize); + if (LIKELY(io_s.ok())) { + r->pre_compression_size += pad_bytes; r->set_offset(r->get_offset() + pad_bytes); } else { - r->SetIOStatus(io_s); - return; + return io_s; } } - if (r->IsParallelCompressionEnabled()) { - if (is_data_block) { - r->pc_rep->file_size_estimator.ReapBlock(block_contents.size(), - r->get_offset()); - } else { - r->pc_rep->file_size_estimator.SetEstimatedFileSize(r->get_offset()); - } + if (r->IsParallelCompressionActive() && is_data_block) { + r->pc_rep->estimated_inflight_size.FetchSubRelaxed(block_contents.size() + + kBlockTrailerSize); } + return io_s; } -void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() { - Rep* r = rep_; - ParallelCompressionRep::BlockRepSlot* slot = nullptr; - ParallelCompressionRep::BlockRep* block_rep = nullptr; - // Starts empty; see FilterBlockBuilder::AddWithPrevKey - std::string prev_block_last_key_no_ts; - while (r->pc_rep->write_queue.pop(slot)) { - assert(slot != nullptr); - slot->Take(block_rep); - assert(block_rep != nullptr); - if (!block_rep->status.ok()) { - r->SetStatus(block_rep->status); - // Reap block so that blocked Flush() can finish - // if there is one, and Flush() will notice !ok() next time. - block_rep->status = Status::OK(); - r->pc_rep->ReapBlock(block_rep); - continue; - } - - Slice prev_key_no_ts = prev_block_last_key_no_ts; - for (size_t i = 0; i < block_rep->keys->Size(); i++) { - auto& key = (*block_rep->keys)[i]; - if (r->filter_builder != nullptr) { - Slice key_no_ts = ExtractUserKeyAndStripTimestamp(key, r->ts_sz); - r->filter_builder->AddWithPrevKey(key_no_ts, prev_key_no_ts); - prev_key_no_ts = key_no_ts; - } - r->index_builder->OnKeyAdded(key); - } - if (r->filter_builder != nullptr) { - prev_block_last_key_no_ts.assign(prev_key_no_ts.data(), - prev_key_no_ts.size()); - } - - r->pc_rep->file_size_estimator.SetCurrBlockUncompSize( - block_rep->data->size()); - WriteMaybeCompressedBlock(block_rep->compressed_contents, - block_rep->compression_type, &r->pending_handle, - BlockType::kData, &block_rep->contents); - if (!ok()) { - break; - } - - r->props.data_size = r->get_offset(); - ++r->props.num_data_blocks; - - if (block_rep->first_key_in_next_block == nullptr) { - r->index_builder->AddIndexEntry(block_rep->keys->Back(), nullptr, - r->pending_handle, - &r->index_separator_scratch); - } else { - Slice first_key_in_next_block = - Slice(*block_rep->first_key_in_next_block); - r->index_builder->AddIndexEntry( - block_rep->keys->Back(), &first_key_in_next_block, r->pending_handle, - &r->index_separator_scratch); - } - - r->pc_rep->ReapBlock(block_rep); +void BlockBasedTableBuilder::MaybeStartParallelCompression() { + if (rep_->compression_parallel_threads <= 1) { + return; } + // Although in theory having a separate thread for writing to the SST file + // could help to hide the latency associated with writing, it is more often + // the case that the latency comes in large units for rare calls to write that + // flush downstream buffers, including in WritableFileWriter. The buffering + // provided by the compression ring buffer is almost negligible for hiding + // that latency. So even with some optimizations, turning on the parallel + // framework when compression is disabled just eats more CPU with little-to-no + // improvement in throughput. + if (!rep_->data_block_compressor) { + // Force the generally best configuration for no compression: no parallelism + return; + } + rep_->pc_rep = std::make_unique( + rep_->compression_parallel_threads); + auto& pc_rep = *rep_->pc_rep; + for (uint32_t i = 0; i <= pc_rep.ring_buffer_mask; i++) { + pc_rep.ring_buffer[i].prepared_index_entry = + rep_->index_builder->CreatePreparedIndexEntry(); + } + pc_rep.worker_threads.reserve(pc_rep.num_worker_threads); + pc_rep.working_areas.resize(pc_rep.num_worker_threads); + for (uint32_t i = 0; i < pc_rep.num_worker_threads; i++) { + auto& wa = pc_rep.working_areas[i]; + if (rep_->data_block_compressor) { + wa.compress = rep_->data_block_compressor->ObtainWorkingArea(); + } + if (rep_->data_block_verify_decompressor) { + wa.verify = rep_->data_block_verify_decompressor->ObtainWorkingArea( + rep_->data_block_compressor->GetPreferredCompressionType()); + } + pc_rep.worker_threads.emplace_back([this, &wa] { BGWorker(wa); }); + } +#ifdef BBTB_PC_WATCHDOG + // Start watchdog thread + pc_rep.watchdog_thread = std::thread([&pc_rep] { pc_rep.BGWatchdog(); }); + pc_rep.live_emit.StoreRelaxed(true); +#endif // BBTB_PC_WATCHDOG } -void BlockBasedTableBuilder::StartParallelCompression() { - rep_->pc_rep.reset( - new ParallelCompressionRep(rep_->compression_opts.parallel_threads)); - rep_->pc_rep->compress_thread_pool.reserve( - rep_->compression_opts.parallel_threads); - for (uint32_t i = 0; i < rep_->compression_opts.parallel_threads; i++) { - rep_->pc_rep->compress_thread_pool.emplace_back([this, i] { - BGWorkCompression(*(rep_->compression_ctxs[i]), - rep_->verify_ctxs[i].get()); - }); - } - rep_->pc_rep->write_thread.reset( - new port::Thread([this] { BGWorkWriteMaybeCompressedBlock(); })); -} - -void BlockBasedTableBuilder::StopParallelCompression() { - rep_->pc_rep->compress_queue.finish(); - for (auto& thread : rep_->pc_rep->compress_thread_pool) { +void BlockBasedTableBuilder::StopParallelCompression(bool abort) { + auto& pc_rep = *rep_->pc_rep; + if (abort) { + pc_rep.SetAbort(pc_rep.emit_thread_state); + } else if (pc_rep.emit_thread_state != + ParallelCompressionRep::ThreadState::kEnd) { + // In case we didn't do a final flush with no next key, which might have + // been skipped if !ok() was set after the start of Finish() + assert(rep_->props.num_data_blocks == 0 || !ok()); + pc_rep.SetNoMoreToEmit(pc_rep.emit_thread_state, pc_rep.emit_slot); + } +#ifdef BBTB_PC_WATCHDOG + pc_rep.live_emit.StoreRelaxed(false); +#endif // BBTB_PC_WATCHDOG + assert(pc_rep.emit_thread_state == ParallelCompressionRep::ThreadState::kEnd); + for (auto& thread : pc_rep.worker_threads) { thread.join(); } - rep_->pc_rep->write_queue.finish(); - rep_->pc_rep->write_thread->join(); +#ifdef BBTB_PC_WATCHDOG + // Wake & shutdown watchdog thread + { + std::unique_lock lock(pc_rep.watchdog_mutex); + pc_rep.shutdown_watchdog = true; + pc_rep.watchdog_cv.notify_all(); + } + pc_rep.watchdog_thread.join(); +#endif // BBTB_PC_WATCHDOG + rep_->pc_rep.reset(); } Status BlockBasedTableBuilder::status() const { return rep_->GetStatus(); } @@ -1577,6 +2241,8 @@ IOStatus BlockBasedTableBuilder::io_status() const { return rep_->GetIOStatus(); } +bool BlockBasedTableBuilder::ok() const { return rep_->StatusOk(); } + Status BlockBasedTableBuilder::InsertBlockInCacheHelper( const Slice& block_contents, const BlockHandle* handle, BlockType block_type) { @@ -1587,11 +2253,15 @@ Status BlockBasedTableBuilder::InsertBlockInCacheHelper( if (block_cache && helper && helper->create_cb) { CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle); size_t charge; + // NOTE: data blocks (and everything else) will be warmed in decompressed + // state, so does not need a dictionary-aware decompressor. The only thing + // needing a decompressor here (in create_context) is warming the + // (de)compression dictionary, which will clone and save a dict-based + // decompressor from the corresponding non-dict decompressor. s = WarmInCache(block_cache, key.AsSlice(), block_contents, &rep_->create_context, helper, Cache::Priority::LOW, &charge); - - if (s.ok()) { + if (LIKELY(s.ok())) { BlockBasedTable::UpdateCacheInsertionMetrics( block_type, nullptr /*get_context*/, charge, s.IsOkOverwritten(), rep_->ioptions.stats); @@ -1617,11 +2287,11 @@ void BlockBasedTableBuilder::WriteFilterBlock( } BlockHandle filter_block_handle; bool is_partitioned_filter = rep_->table_options.partition_filters; - if (ok()) { + if (LIKELY(ok())) { rep_->props.num_filter_entries += rep_->filter_builder->EstimateEntriesAdded(); Status s = Status::Incomplete(); - while (ok() && s.IsIncomplete()) { + while (LIKELY(ok()) && s.IsIncomplete()) { // filter_data is used to store the transferred filter data payload from // FilterBlockBuilder and deallocate the payload by going out of scope. // Otherwise, the payload will unnecessarily remain until @@ -1651,7 +2321,7 @@ void BlockBasedTableBuilder::WriteFilterBlock( } rep_->filter_builder->ResetFilterBitsBuilder(); } - if (ok()) { + if (LIKELY(ok())) { // Add mapping from ".Name" to location // of filter data. std::string key; @@ -1664,30 +2334,37 @@ void BlockBasedTableBuilder::WriteFilterBlock( void BlockBasedTableBuilder::WriteIndexBlock( MetaIndexBuilder* meta_index_builder, BlockHandle* index_block_handle) { - if (!ok()) { + if (UNLIKELY(!ok())) { return; } IndexBuilder::IndexBlocks index_blocks; auto index_builder_status = rep_->index_builder->Finish(&index_blocks); - if (index_builder_status.IsIncomplete()) { - // We we have more than one index partition then meta_blocks are not - // supported for the index. Currently meta_blocks are used only by - // HashIndexBuilder which is not multi-partition. - assert(index_blocks.meta_blocks.empty()); - } else if (ok() && !index_builder_status.ok()) { + if (LIKELY(ok()) && !index_builder_status.ok() && + !index_builder_status.IsIncomplete()) { + // If the index builder failed for non-Incomplete errors, we should + // mark the entire builder as having failed wit that status. However, + // If the index builder failed with an incomplete error, we should + // continue writing out any meta blocks that may have been generated. rep_->SetStatus(index_builder_status); } - if (ok()) { + + if (LIKELY(ok())) { for (const auto& item : index_blocks.meta_blocks) { BlockHandle block_handle; - WriteBlock(item.second, &block_handle, BlockType::kIndex); - if (!ok()) { + if (item.second.first == BlockType::kIndex) { + WriteBlock(item.second.second, &block_handle, item.second.first); + } else { + assert(item.second.first == BlockType::kUserDefinedIndex); + WriteMaybeCompressedBlock(item.second.second, kNoCompression, + &block_handle, item.second.first); + } + if (UNLIKELY(!ok())) { break; } meta_index_builder->Add(item.first, block_handle); } } - if (ok()) { + if (LIKELY(ok())) { if (rep_->table_options.enable_index_compression) { WriteBlock(index_blocks.index_block_contents, index_block_handle, BlockType::kIndex); @@ -1700,7 +2377,7 @@ void BlockBasedTableBuilder::WriteIndexBlock( // If there are more index partitions, finish them and write them out if (index_builder_status.IsIncomplete()) { bool index_building_finished = false; - while (ok() && !index_building_finished) { + while (LIKELY(ok()) && !index_building_finished) { Status s = rep_->index_builder->Finish(&index_blocks, *index_block_handle); if (s.ok()) { @@ -1726,8 +2403,8 @@ void BlockBasedTableBuilder::WriteIndexBlock( } } // If success and need to record in metaindex rather than footer... - if (!FormatVersionUsesIndexHandleInFooter( - rep_->table_options.format_version)) { + if (LIKELY(ok()) && !FormatVersionUsesIndexHandleInFooter( + rep_->table_options.format_version)) { meta_index_builder->Add(kIndexBlockName, *index_block_handle); } } @@ -1735,7 +2412,7 @@ void BlockBasedTableBuilder::WriteIndexBlock( void BlockBasedTableBuilder::WritePropertiesBlock( MetaIndexBuilder* meta_index_builder) { BlockHandle properties_block_handle; - if (ok()) { + if (LIKELY(ok())) { PropertyBlockBuilder property_block_builder; rep_->props.filter_policy_name = rep_->table_options.filter_policy != nullptr @@ -1750,10 +2427,6 @@ void BlockBasedTableBuilder::WritePropertiesBlock( rep_->ioptions.merge_operator != nullptr ? rep_->ioptions.merge_operator->Name() : "nullptr"; - rep_->props.compression_name = - CompressionTypeToString(rep_->compression_type); - rep_->props.compression_options = - CompressionOptionsToString(rep_->compression_opts); rep_->props.prefix_extractor_name = rep_->prefix_extractor ? rep_->prefix_extractor->AsString() : "nullptr"; std::string property_collectors_names = "["; @@ -1767,37 +2440,42 @@ void BlockBasedTableBuilder::WritePropertiesBlock( } property_collectors_names += "]"; rep_->props.property_collectors_names = property_collectors_names; + + rep_->PostPopulateCompressionProperties(); + if (rep_->table_options.index_type == BlockBasedTableOptions::kTwoLevelIndexSearch) { assert(rep_->p_index_builder_ != nullptr); rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions(); rep_->props.top_level_index_size = - rep_->p_index_builder_->TopLevelIndexSize(rep_->offset); + rep_->p_index_builder_->TopLevelIndexSize(rep_->offset.LoadRelaxed()); } rep_->props.index_key_is_user_key = - !rep_->index_builder->seperator_is_key_plus_seq(); + !rep_->index_builder->separator_is_key_plus_seq(); rep_->props.index_value_is_delta_encoded = rep_->use_delta_encoding_for_index_values; - if (rep_->sampled_input_data_bytes > 0) { + if (rep_->sampled_input_data_bytes.LoadRelaxed() > 0) { rep_->props.slow_compression_estimated_data_size = static_cast( - static_cast(rep_->sampled_output_slow_data_bytes) / - rep_->sampled_input_data_bytes * - rep_->compressible_input_data_bytes + - rep_->uncompressible_input_data_bytes + 0.5); + static_cast( + rep_->sampled_output_slow_data_bytes.LoadRelaxed()) / + rep_->sampled_input_data_bytes.LoadRelaxed() * + rep_->compressible_input_data_bytes.LoadRelaxed() + + rep_->uncompressible_input_data_bytes.LoadRelaxed() + 0.5); rep_->props.fast_compression_estimated_data_size = static_cast( - static_cast(rep_->sampled_output_fast_data_bytes) / - rep_->sampled_input_data_bytes * - rep_->compressible_input_data_bytes + - rep_->uncompressible_input_data_bytes + 0.5); + static_cast( + rep_->sampled_output_fast_data_bytes.LoadRelaxed()) / + rep_->sampled_input_data_bytes.LoadRelaxed() * + rep_->compressible_input_data_bytes.LoadRelaxed() + + rep_->uncompressible_input_data_bytes.LoadRelaxed() + 0.5); } else if (rep_->sample_for_compression > 0) { - // We tried to sample but none were found. Assume worst-case (compression - // ratio 1.0) so data is complete and aggregatable. + // We tried to sample but none were found. Assume worst-case + // (compression ratio 1.0) so data is complete and aggregatable. rep_->props.slow_compression_estimated_data_size = - rep_->compressible_input_data_bytes + - rep_->uncompressible_input_data_bytes; + rep_->compressible_input_data_bytes.LoadRelaxed() + + rep_->uncompressible_input_data_bytes.LoadRelaxed(); rep_->props.fast_compression_estimated_data_size = - rep_->compressible_input_data_bytes + - rep_->uncompressible_input_data_bytes; + rep_->compressible_input_data_bytes.LoadRelaxed() + + rep_->uncompressible_input_data_bytes.LoadRelaxed(); } rep_->props.user_defined_timestamps_persisted = rep_->persist_user_defined_timestamps; @@ -1818,7 +2496,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock( WriteMaybeCompressedBlock(block_data, kNoCompression, &properties_block_handle, BlockType::kProperties); } - if (ok()) { + if (LIKELY(ok())) { #ifndef NDEBUG { uint64_t props_block_offset = properties_block_handle.offset(); @@ -1842,21 +2520,21 @@ void BlockBasedTableBuilder::WritePropertiesBlock( void BlockBasedTableBuilder::WriteCompressionDictBlock( MetaIndexBuilder* meta_index_builder) { - if (rep_->compression_dict != nullptr && - rep_->compression_dict->GetRawDict().size()) { + Slice compression_dict; + if (rep_->data_block_compressor) { + compression_dict = rep_->data_block_compressor->GetSerializedDict(); + } + if (!compression_dict.empty()) { BlockHandle compression_dict_block_handle; - if (ok()) { - WriteMaybeCompressedBlock(rep_->compression_dict->GetRawDict(), - kNoCompression, &compression_dict_block_handle, + if (LIKELY(ok())) { + WriteMaybeCompressedBlock(compression_dict, kNoCompression, + &compression_dict_block_handle, BlockType::kCompressionDictionary); -#ifndef NDEBUG - Slice compression_dict = rep_->compression_dict->GetRawDict(); TEST_SYNC_POINT_CALLBACK( "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict", &compression_dict); -#endif // NDEBUG } - if (ok()) { + if (LIKELY(ok())) { meta_index_builder->Add(kCompressionDictBlockName, compression_dict_block_handle); } @@ -1865,7 +2543,7 @@ void BlockBasedTableBuilder::WriteCompressionDictBlock( void BlockBasedTableBuilder::WriteRangeDelBlock( MetaIndexBuilder* meta_index_builder) { - if (ok() && !rep_->range_del_block.empty()) { + if (LIKELY(ok()) && !rep_->range_del_block.empty()) { BlockHandle range_del_block_handle; WriteMaybeCompressedBlock(rep_->range_del_block.Finish(), kNoCompression, &range_del_block_handle, @@ -1876,11 +2554,8 @@ void BlockBasedTableBuilder::WriteRangeDelBlock( void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle, BlockHandle& index_block_handle) { - assert(ok()); - Rep* r = rep_; - // this is guaranteed by BlockBasedTableBuilder's constructor - assert(r->table_options.checksum == kCRC32c || - r->table_options.format_version != 0); + assert(LIKELY(ok())); + Rep* r = rep_.get(); FooterBuilder footer; Status s = footer.Build(kBlockBasedTableMagicNumber, r->table_options.format_version, r->get_offset(), @@ -1899,30 +2574,56 @@ void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle, } ios = r->file->Append(io_options, footer.GetSlice()); if (ios.ok()) { + r->pre_compression_size += footer.GetSlice().size(); r->set_offset(r->get_offset() + footer.GetSlice().size()); } else { r->SetIOStatus(ios); } } -void BlockBasedTableBuilder::EnterUnbuffered() { - Rep* r = rep_; +void BlockBasedTableBuilder::MaybeEnterUnbuffered( + const Slice* first_key_in_next_block) { + Rep* r = rep_.get(); assert(r->state == Rep::State::kBuffered); + // Don't yet enter unbuffered (early return) if none of the conditions are + // met + if (first_key_in_next_block != nullptr) { + bool exceeds_buffer_limit = + (r->buffer_limit != 0 && r->data_begin_offset > r->buffer_limit); + if (!exceeds_buffer_limit) { + bool exceeds_global_block_cache_limit = false; + // Increase cache charging for the last buffered data block + // only if the block is not going to be unbuffered immediately + // and there exists a cache reservation manager + if (r->compression_dict_buffer_cache_res_mgr != nullptr) { + Status s = + r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation( + r->data_begin_offset); + exceeds_global_block_cache_limit = s.IsMemoryLimit(); + } + if (!exceeds_global_block_cache_limit) { + return; + } + } + } + + // Enter Unbuffered state r->state = Rep::State::kUnbuffered; - const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0 - ? r->compression_opts.zstd_max_train_bytes - : r->compression_opts.max_dict_bytes; const size_t kNumBlocksBuffered = r->data_block_buffers.size(); if (kNumBlocksBuffered == 0) { // The below code is neither safe nor necessary for handling zero data // blocks. + // For PostPopulateCompressionProperties() + assert(!r->data_block_compressor); + r->data_block_compressor = r->basic_compressor.get(); return; } // Abstract algebra teaches us that a finite cyclic group (such as the // additive group of integers modulo N) can be generated by a number that is // coprime with N. Since N is variable (number of buffered data blocks), we - // must then pick a prime number in order to guarantee coprimeness with any N. + // must then pick a prime number in order to guarantee coprimeness with any + // N. // // One downside of this approach is the spread will be poor when // `kPrimeGeneratorRemainder` is close to zero or close to @@ -1936,17 +2637,20 @@ void BlockBasedTableBuilder::EnterUnbuffered() { kPrimeGenerator % static_cast(kNumBlocksBuffered)); const size_t kInitSampleIdx = kNumBlocksBuffered / 2; - std::string compression_dict_samples; - std::vector compression_dict_sample_lens; + Compressor::DictSamples samples; size_t buffer_idx = kInitSampleIdx; + // Get max_sample_bytes from the DictSampling guidance + auto* sampling = + std::get_if(&r->data_block_dict_guidance); + assert(sampling != nullptr); + size_t max_sample_bytes = sampling->max_sample_bytes; for (size_t i = 0; - i < kNumBlocksBuffered && compression_dict_samples.size() < kSampleBytes; + i < kNumBlocksBuffered && samples.sample_data.size() < max_sample_bytes; ++i) { - size_t copy_len = std::min(kSampleBytes - compression_dict_samples.size(), + size_t copy_len = std::min(max_sample_bytes - samples.sample_data.size(), r->data_block_buffers[buffer_idx].size()); - compression_dict_samples.append(r->data_block_buffers[buffer_idx], 0, - copy_len); - compression_dict_sample_lens.emplace_back(copy_len); + samples.sample_data.append(r->data_block_buffers[buffer_idx], 0, copy_len); + samples.sample_lens.emplace_back(copy_len); buffer_idx += kPrimeGeneratorRemainder; if (buffer_idx >= kNumBlocksBuffered) { @@ -1954,26 +2658,36 @@ void BlockBasedTableBuilder::EnterUnbuffered() { } } - // final data block flushed, now we can generate dictionary from the samples. - // OK if compression_dict_samples is empty, we'll just get empty dictionary. - std::string dict; - if (r->compression_opts.zstd_max_train_bytes > 0) { - if (r->compression_opts.use_zstd_dict_trainer) { - dict = ZSTD_TrainDictionary(compression_dict_samples, - compression_dict_sample_lens, - r->compression_opts.max_dict_bytes); + assert(samples.sample_data.size() > 0); + + // final sample data block flushed, now we can generate dictionary (or it + // might opt not to use a dictionary and that's ok) + r->data_block_compressor = + MaybeCloneSpecialized(r->basic_compressor.get(), + CacheEntryRole::kDataBlock, std::move(samples)); + + Slice serialized_dict = r->data_block_compressor->GetSerializedDict(); + if (r->verify_decompressor) { + if (serialized_dict.empty()) { + // No dictionary + r->data_block_verify_decompressor = r->verify_decompressor.get(); } else { - dict = ZSTD_FinalizeDictionary( - compression_dict_samples, compression_dict_sample_lens, - r->compression_opts.max_dict_bytes, r->compression_opts.level); + // Get an updated dictionary-aware decompressor for verification. + Status s = r->verify_decompressor->MaybeCloneForDict( + serialized_dict, &r->verify_decompressor_with_dict); + // Dictionary support must be present on the decompressor side if it's + // on the compressor side. + assert(r->verify_decompressor_with_dict); + if (r->verify_decompressor_with_dict) { + r->data_block_verify_decompressor = + r->verify_decompressor_with_dict.get(); + assert(s.ok()); + } else { + assert(!s.ok()); + r->SetStatus(s); + } } - } else { - dict = std::move(compression_dict_samples); } - r->compression_dict.reset(new CompressionDict(dict, r->compression_type, - r->compression_opts.level)); - r->verify_dict.reset( - new UncompressionDict(dict, r->compression_type == kZSTD)); auto get_iterator_for_block = [&r](size_t i) { auto& data_block = r->data_block_buffers[i]; @@ -1998,59 +2712,37 @@ void BlockBasedTableBuilder::EnterUnbuffered() { assert(iter != nullptr); }; + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + if (r->filter_builder != nullptr) { + // NOTE: AddWithPrevKey here would only save key copying if prev is + // pinned (iter->IsKeyPinned()), which is probably rare with delta + // encoding. OK to go from Add() here to AddWithPrevKey() in + // unbuffered operation. + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, r->ts_sz)); + } + r->index_builder->OnKeyAdded(key, iter->value()); + } + + Slice first_key_in_loop_next_block; + const Slice* first_key_in_loop_next_block_ptr; if (i + 1 < r->data_block_buffers.size()) { next_block_iter = get_iterator_for_block(i + 1); + first_key_in_loop_next_block = next_block_iter->key(); + first_key_in_loop_next_block_ptr = &first_key_in_loop_next_block; + } else { + first_key_in_loop_next_block_ptr = first_key_in_next_block; } auto& data_block = r->data_block_buffers[i]; - if (r->IsParallelCompressionEnabled()) { - Slice first_key_in_next_block; - const Slice* first_key_in_next_block_ptr = &first_key_in_next_block; - if (i + 1 < r->data_block_buffers.size()) { - assert(next_block_iter != nullptr); - first_key_in_next_block = next_block_iter->key(); - } else { - first_key_in_next_block_ptr = r->first_key_in_next_block; - } - - std::vector keys; - for (; iter->Valid(); iter->Next()) { - keys.emplace_back(iter->key().ToString()); - } - - ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock( - r->compression_type, first_key_in_next_block_ptr, &data_block, &keys); + iter->SeekToLast(); + assert(iter->Valid()); + if (r->IsParallelCompressionActive()) { + EmitBlockForParallel(data_block, iter->key(), + first_key_in_loop_next_block_ptr); - assert(block_rep != nullptr); - r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(), - r->get_offset()); - r->pc_rep->EmitBlock(block_rep); } else { - for (; iter->Valid(); iter->Next()) { - Slice key = iter->key(); - if (r->filter_builder != nullptr) { - // NOTE: AddWithPrevKey here would only save key copying if prev is - // pinned (iter->IsKeyPinned()), which is probably rare with delta - // encoding. OK to go from Add() here to AddWithPrevKey() in - // unbuffered operation. - r->filter_builder->Add( - ExtractUserKeyAndStripTimestamp(key, r->ts_sz)); - } - r->index_builder->OnKeyAdded(key); - } - WriteBlock(Slice(data_block), &r->pending_handle, BlockType::kData); - if (ok() && i + 1 < r->data_block_buffers.size()) { - assert(next_block_iter != nullptr); - Slice first_key_in_next_block = next_block_iter->key(); - - Slice* first_key_in_next_block_ptr = &first_key_in_next_block; - - iter->SeekToLast(); - assert(iter->Valid()); - r->index_builder->AddIndexEntry( - iter->key(), first_key_in_next_block_ptr, r->pending_handle, - &r->index_separator_scratch); - } + EmitBlock(data_block, iter->key(), first_key_in_loop_next_block_ptr); } std::swap(iter, next_block_iter); } @@ -2065,32 +2757,36 @@ void BlockBasedTableBuilder::EnterUnbuffered() { } Status BlockBasedTableBuilder::Finish() { - Rep* r = rep_; + Rep* r = rep_.get(); assert(r->state != Rep::State::kClosed); - bool empty_data_block = r->data_block.empty(); - r->first_key_in_next_block = nullptr; - Flush(); - if (r->state == Rep::State::kBuffered) { - EnterUnbuffered(); - } - if (r->IsParallelCompressionEnabled()) { - StopParallelCompression(); + #ifndef NDEBUG - for (const auto& br : r->pc_rep->block_rep_buf) { - assert(br.status.ok()); + { + // This sync point callback is a simple approximation of a failure detected + // in parallel compression after the start of calling Finish() but before + // Finish() calls Flush() + IOStatus s = rep_->GetIOStatus(); + TEST_SYNC_POINT_CALLBACK("BlockBasedTableBuilder::Finish:ParallelIOStatus", + &s); + if (!s.ok()) { + rep_->SetIOStatus(s); } + } #endif // !NDEBUG - } else { - // To make sure properties block is able to keep the accurate size of index - // block, we will finish writing all index entries first. - if (ok() && !empty_data_block) { - r->index_builder->AddIndexEntry( - r->last_ikey, nullptr /* no next data block */, r->pending_handle, - &r->index_separator_scratch); - } + // To make sure properties block is able to keep the accurate size of index + // block, we will finish writing all index entries first, in Flush(). + Flush(/*first_key_in_next_block=*/nullptr); + if (rep_->state == Rep::State::kBuffered) { + MaybeEnterUnbuffered(nullptr); + } + assert(r->state == Rep::State::kUnbuffered); + if (r->IsParallelCompressionActive()) { + StopParallelCompression(/*abort=*/false); } - r->props.tail_start_offset = r->offset; + r->props.tail_start_offset = r->offset.LoadRelaxed(); + + uint64_t last_estimated_tail_size = EstimatedTailSize(); // Write meta blocks, metaindex block and footer in the following order. // 1. [meta block: filter] @@ -2107,36 +2803,45 @@ Status BlockBasedTableBuilder::Finish() { WriteCompressionDictBlock(&meta_index_builder); WriteRangeDelBlock(&meta_index_builder); WritePropertiesBlock(&meta_index_builder); - if (ok()) { + if (LIKELY(ok())) { // flush the meta index block WriteMaybeCompressedBlock(meta_index_builder.Finish(), kNoCompression, &metaindex_block_handle, BlockType::kMetaIndex); } - if (ok()) { + if (LIKELY(ok())) { WriteFooter(metaindex_block_handle, index_block_handle); } r->state = Rep::State::kClosed; - r->tail_size = r->offset - r->props.tail_start_offset; - - Status ret_status = r->CopyStatus(); - IOStatus ios = r->GetIOStatus(); - if (!ios.ok() && ret_status.ok()) { - // Let io_status supersede ok status (otherwise status takes precedennce) - ret_status = ios; - } - return ret_status; + r->tail_size = r->offset.LoadRelaxed() - r->props.tail_start_offset; + + // Assert tail size estimation is an overestimate only when tail size + // estimation option is enabled for compaction files with supported + // index/filter types: + // - Shortened indexes (kBinarySearch, kBinarySearchWithFirstKey) + // - Partitioned indexes (kTwoLevelIndexSearch) + // - Full filters + // - Partitioned filters + if (r->target_file_size_is_upper_bound && + r->reason == TableFileCreationReason::kCompaction && + r->table_options.index_type != BlockBasedTableOptions::kHashSearch) { + ROCKS_LOG_WARN(r->ioptions.info_log, + "File number: %" PRIu64 ", Estimated tail size = %" PRIu64 + " bytes, Actual tail size = %" PRIu64 " bytes", + r->props.orig_file_number, last_estimated_tail_size, + r->tail_size); + assert(r->tail_size <= last_estimated_tail_size); + } + + return r->GetStatus(); } void BlockBasedTableBuilder::Abandon() { assert(rep_->state != Rep::State::kClosed); - if (rep_->IsParallelCompressionEnabled()) { - StopParallelCompression(); + if (rep_->IsParallelCompressionActive()) { + StopParallelCompression(/*abort=*/true); } rep_->state = Rep::State::kClosed; -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED // Avoid unnecessary lock acquisition - rep_->CopyStatus().PermitUncheckedError(); - rep_->CopyIOStatus().PermitUncheckedError(); -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + rep_->GetIOStatus().PermitUncheckedError(); } uint64_t BlockBasedTableBuilder::NumEntries() const { @@ -2147,18 +2852,66 @@ bool BlockBasedTableBuilder::IsEmpty() const { return rep_->props.num_entries == 0 && rep_->props.num_range_deletions == 0; } -uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; } +uint64_t BlockBasedTableBuilder::PreCompressionSize() const { + return rep_->pre_compression_size; +} + +uint64_t BlockBasedTableBuilder::FileSize() const { + return rep_->offset.LoadRelaxed(); +} uint64_t BlockBasedTableBuilder::EstimatedFileSize() const { - if (rep_->IsParallelCompressionEnabled()) { - // Use compression ratio so far and inflight uncompressed bytes to estimate - // final SST size. - return rep_->pc_rep->file_size_estimator.GetEstimatedFileSize(); + if (rep_->IsParallelCompressionActive()) { + // Use upper bound on "inflight" data size to estimate + return FileSize() + rep_->pc_rep->estimated_inflight_size.LoadRelaxed(); } else { return FileSize(); } } +uint64_t BlockBasedTableBuilder::EstimatedTailSize() const { + uint64_t estimated_tail_size = 0; + + // 1. Estimate index size + if (rep_->table_options.index_type == + BlockBasedTableOptions::kTwoLevelIndexSearch) { + assert(rep_->p_index_builder_); + estimated_tail_size += rep_->p_index_builder_->CurrentIndexSizeEstimate(); + } else { + assert(rep_->index_builder); + estimated_tail_size += rep_->index_builder->CurrentIndexSizeEstimate(); + } + + // 2. Estimate filter size + if (rep_->filter_builder) { + estimated_tail_size += rep_->filter_builder->CurrentFilterSizeEstimate(); + } + + // 3. Estimate compression dictionary size + if (rep_->data_block_compressor) { + Slice dict = rep_->data_block_compressor->GetSerializedDict(); + if (!dict.empty()) { + estimated_tail_size += dict.size(); + } + } + + // 4. Estimate range deletion block size + if (!rep_->range_del_block.empty()) { + estimated_tail_size += rep_->range_del_block.CurrentSizeEstimate(); + } + + // 5. Estimate properties block size conservatively (~1-2KB) + estimated_tail_size += 2048; + + // 6. Estimate meta-index block size conservatively (~1KB) + estimated_tail_size += 1024; + + // 7. Add footer size + estimated_tail_size += Footer::kMaxEncodedLength; + + return estimated_tail_size; +} + uint64_t BlockBasedTableBuilder::GetTailSize() const { return rep_->tail_size; } bool BlockBasedTableBuilder::NeedCompact() const { @@ -2201,7 +2954,4 @@ const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter."; const std::string BlockBasedTable::kPartitionedFilterBlockPrefix = "partitionedfilter."; -#ifndef NDEBUG -RelaxedAtomic g_hack_mixed_compression_in_block_based_table{0}; -#endif // !NDEBUG } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h index 61f5ad78e5a5..0988f2b959ae 100644 --- a/table/block_based/block_based_table_builder.h +++ b/table/block_based/block_based_table_builder.h @@ -35,7 +35,6 @@ class WritableFile; struct BlockBasedTableOptions; extern const uint64_t kBlockBasedTableMagicNumber; -extern const uint64_t kLegacyBlockBasedTableMagicNumber; class BlockBasedTableBuilder : public TableBuilder { public: @@ -83,15 +82,21 @@ class BlockBasedTableBuilder : public TableBuilder { bool IsEmpty() const override; + uint64_t PreCompressionSize() const override; + // Size of the file generated so far. If invoked after a successful // Finish() call, returns the size of the final generated file. uint64_t FileSize() const override; - // Estimated size of the file generated so far. This is used when - // FileSize() cannot estimate final SST size, e.g. parallel compression - // is enabled. + // Estimated size of the file generated so far (based on data blocks, this + // estimate does not include meta blocks). This is used when FileSize() cannot + // estimate final SST size, e.g. parallel compression is enabled. uint64_t EstimatedFileSize() const override; + // Estimated tail size of the SST file generated so far. The "tail" refers to + // all blocks written after data blocks (index + filter). + uint64_t EstimatedTailSize() const override; + // Get the size of the "tail" part of a SST file. "Tail" refers to // all blocks after data blocks till the end of the SST file. uint64_t GetTailSize() const override; @@ -110,27 +115,41 @@ class BlockBasedTableBuilder : public TableBuilder { void SetSeqnoTimeTableProperties(const SeqnoToTimeMapping& relevant_mapping, uint64_t oldest_ancestor_time) override; + uint64_t GetWorkerCPUMicros() const override; + private: - bool ok() const { return status().ok(); } + bool ok() const; - // Transition state from buffered to unbuffered. See `Rep::State` API comment - // for details of the states. + // Transition state from buffered to unbuffered if the conditions are met. See + // `Rep::State` API comment for details of the states. // REQUIRES: `rep_->state == kBuffered` - void EnterUnbuffered(); - - // Call block's Finish() method and then - // - in buffered mode, buffer the uncompressed block contents. - // - in unbuffered mode, write the compressed block contents to file. - void WriteBlock(BlockBuilder* block, BlockHandle* handle, - BlockType blocktype); - - // Compress and write block content to the file. + void MaybeEnterUnbuffered(const Slice* first_key_in_next_block); + + // Try to keep some parallel-specific code separate to improve hot code + // locality for non-parallel case + void EmitBlock(std::string& uncompressed, + const Slice& last_key_in_current_block, + const Slice* first_key_in_next_block); + void EmitBlockForParallel(std::string& uncompressed, + const Slice& last_key_in_current_block, + const Slice* first_key_in_next_block); + + // Compress and write block content to the file, from a single-threaded + // context + // @skip_delta_encoding : This is set to non null for data blocks, so that + // caller would know whether the index entry of this data block should + // skip delta encoding or not void WriteBlock(const Slice& block_contents, BlockHandle* handle, - BlockType block_type); + BlockType block_type, bool* skip_delta_encoding = nullptr); // Directly write data to the file. - void WriteMaybeCompressedBlock( + void WriteMaybeCompressedBlock(const Slice& block_contents, CompressionType, + BlockHandle* handle, BlockType block_type, + const Slice* uncompressed_block_data = nullptr, + bool* skip_delta_encoding = nullptr); + IOStatus WriteMaybeCompressedBlockImpl( const Slice& block_contents, CompressionType, BlockHandle* handle, - BlockType block_type, const Slice* uncompressed_block_data = nullptr); + BlockType block_type, const Slice* uncompressed_block_data = nullptr, + bool* skip_delta_encoding = nullptr); void SetupCacheKeyPrefix(const TableBuilderOptions& tbo); @@ -158,58 +177,38 @@ class BlockBasedTableBuilder : public TableBuilder { struct Rep; class BlockBasedTablePropertiesCollectorFactory; class BlockBasedTablePropertiesCollector; - Rep* rep_; - + std::unique_ptr rep_; + struct WorkingAreaPair; struct ParallelCompressionRep; // Advanced operation: flush any buffered key/value pairs to file. // Can be used to ensure that two adjacent entries never live in // the same data block. Most clients should not need to use this method. // REQUIRES: Finish(), Abandon() have not been called - void Flush(); + void Flush(const Slice* first_key_in_next_block); // Some compression libraries fail when the uncompressed size is bigger than // int. If uncompressed size is bigger than kCompressionSizeLimit, don't // compress it const uint64_t kCompressionSizeLimit = std::numeric_limits::max(); - // Get blocks from mem-table walking thread, compress them and - // pass them to the write thread. Used in parallel compression mode only - void BGWorkCompression(const CompressionContext& compression_ctx, - UncompressionContext* verify_ctx); + // Code for a "parallel compression" worker thread, which can really do SST + // writes and block compressions alternately. + void BGWorker(WorkingAreaPair& working_area); // Given uncompressed block content, try to compress it and return result and // compression type - void CompressAndVerifyBlock(const Slice& uncompressed_block_data, - bool is_data_block, - const CompressionContext& compression_ctx, - UncompressionContext* verify_ctx, - std::string* compressed_output, - Slice* result_block_contents, - CompressionType* result_compression_type, - Status* out_status); - - // Get compressed blocks from BGWorkCompression and write them into SST - void BGWorkWriteMaybeCompressedBlock(); - - // Initialize parallel compression context and - // start BGWorkCompression and BGWorkWriteMaybeCompressedBlock threads - void StartParallelCompression(); - - // Stop BGWorkCompression and BGWorkWriteMaybeCompressedBlock threads - void StopParallelCompression(); -}; + Status CompressAndVerifyBlock(const Slice& uncompressed_block_data, + bool is_data_block, + WorkingAreaPair& working_area, + GrowableBuffer* compressed_output, + CompressionType* result_compression_type); -Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info, - CompressionType* type, uint32_t format_version, - bool do_sample, std::string* compressed_output, - std::string* sampled_output_fast, - std::string* sampled_output_slow); - -#ifndef NDEBUG -// 0 == disable the hack -// > 0 => counter for rotating through compression types -extern RelaxedAtomic g_hack_mixed_compression_in_block_based_table; -#endif + // If configured, start worker threads for parallel compression + void MaybeStartParallelCompression(); + + // Stop worker threads for parallel compression + void StopParallelCompression(bool abort); +}; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 7add9fb16fcb..f90e95f36a06 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -20,11 +20,14 @@ #include "options/options_helper.h" #include "port/port.h" #include "rocksdb/cache.h" +#include "rocksdb/comparator.h" #include "rocksdb/convenience.h" #include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" #include "rocksdb/rocksdb_namespace.h" #include "rocksdb/table.h" +#include "rocksdb/user_defined_index.h" +#include "rocksdb/utilities/customizable_util.h" #include "rocksdb/utilities/options_type.h" #include "table/block_based/block_based_table_builder.h" #include "table/block_based/block_based_table_reader.h" @@ -182,6 +185,12 @@ static std::unordered_map {"kBinarySearchWithFirstKey", BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}}; +static std::unordered_map + block_base_table_index_search_type_string_map = { + {"kBinary", BlockBasedTableOptions::BlockSearchType::kBinary}, + {"kInterpolation", + BlockBasedTableOptions::BlockSearchType::kInterpolation}}; + static std::unordered_map block_base_table_data_block_index_type_string_map = { @@ -259,6 +268,10 @@ static struct BlockBasedTableTypeInfo { {"index_type", OptionTypeInfo::Enum( offsetof(struct BlockBasedTableOptions, index_type), &block_base_table_index_type_string_map)}, + {"index_block_search_type", + OptionTypeInfo::Enum( + offsetof(struct BlockBasedTableOptions, index_block_search_type), + &block_base_table_index_search_type_string_map)}, {"hash_index_allow_collision", {0, OptionType::kBoolean, OptionVerificationType::kDeprecated}}, {"data_block_index_type", @@ -312,6 +325,11 @@ static struct BlockBasedTableTypeInfo { OptionTypeInfo::AsCustomSharedPtr( offsetof(struct BlockBasedTableOptions, filter_policy), OptionVerificationType::kByNameAllowFromNull)}, + {"user_defined_index_factory", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct BlockBasedTableOptions, + user_defined_index_factory), + OptionVerificationType::kByNameAllowFromNull)}, {"whole_key_filtering", {offsetof(struct BlockBasedTableOptions, whole_key_filtering), OptionType::kBoolean, OptionVerificationType::kNormal}}, @@ -357,6 +375,13 @@ static struct BlockBasedTableTypeInfo { {"block_align", {offsetof(struct BlockBasedTableOptions, block_align), OptionType::kBoolean, OptionVerificationType::kNormal}}, + {"super_block_alignment_size", + {offsetof(struct BlockBasedTableOptions, super_block_alignment_size), + OptionType::kSizeT, OptionVerificationType::kNormal}}, + {"super_block_alignment_space_overhead_ratio", + {offsetof(struct BlockBasedTableOptions, + super_block_alignment_space_overhead_ratio), + OptionType::kSizeT, OptionVerificationType::kNormal}}, {"pin_top_level_index_and_filter", {offsetof(struct BlockBasedTableOptions, pin_top_level_index_and_filter), @@ -392,6 +417,9 @@ static struct BlockBasedTableTypeInfo { {offsetof(struct BlockBasedTableOptions, num_file_reads_for_auto_readahead), OptionType::kUInt64T, OptionVerificationType::kNormal}}, + {"fail_if_no_udi_on_open", + {offsetof(struct BlockBasedTableOptions, fail_if_no_udi_on_open), + OptionType::kBoolean, OptionVerificationType::kNormal}}, }; } } block_based_table_type_info; @@ -427,10 +455,10 @@ void BlockBasedTableFactory::InitializeOptions() { if (table_options_.no_block_cache) { table_options_.block_cache.reset(); } else if (table_options_.block_cache == nullptr) { - LRUCacheOptions co; - // 32MB, the recommended minimum size for 64 shards, to reduce contention - co.capacity = 32 << 20; - table_options_.block_cache = NewLRUCache(co); + // Now using AutoHCC by default, with existing default size of 32MB + // which is just one cache shard in HCC + HyperClockCacheOptions hcc_opts{size_t{32} << 20}; + table_options_.block_cache = hcc_opts.MakeSharedCache(); } if (table_options_.block_size_deviation < 0 || table_options_.block_size_deviation > 100) { @@ -467,6 +495,21 @@ void BlockBasedTableFactory::InitializeOptions() { options_overrides_iter->second.charged = options.charged; } } + + if (table_options_.format_version < kMinSupportedBbtFormatVersionForWrite) { + // In TEST mode, allow writing format versions that are at least supported + // for reading (so that we have a way of testing the read side). + if (TEST_AllowUnsupportedFormatVersion()) { + if (table_options_.format_version < + kMinSupportedBbtFormatVersionForRead) { + table_options_.format_version = kMinSupportedBbtFormatVersionForWrite; + } + } else { + table_options_.format_version = kMinSupportedBbtFormatVersionForWrite; + } + } + // NOTE: do not sanitize too high format_version, so that it can be rejected + // in validation } Status BlockBasedTableFactory::PrepareOptions(const ConfigOptions& opts) { @@ -555,9 +598,11 @@ Status BlockBasedTableFactory::NewTableReader( file_size, table_reader_options.block_protection_bytes_per_key, table_reader, table_reader_options.tail_size, shared_state_->table_reader_cache_res_mgr, - table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, - table_reader_options.skip_filters, table_reader_options.level, - table_reader_options.immortal, table_reader_options.largest_seqno, + table_reader_options.prefix_extractor, + table_reader_options.compression_manager, + prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, + table_reader_options.level, table_reader_options.immortal, + table_reader_options.largest_seqno, table_reader_options.force_direct_prefetch, &shared_state_->tail_prefetch_stats, table_reader_options.block_cache_tracer, @@ -582,6 +627,14 @@ Status BlockBasedTableFactory::ValidateOptions( "Hash index is specified for block-based " "table, but prefix_extractor is not given"); } + if (table_options_.index_block_search_type == + BlockBasedTableOptions::kInterpolation) { + // Interpolation search requires BytewiseComparator + if (cf_opts.comparator != BytewiseComparator()) { + return Status::InvalidArgument( + "Interpolation search requires BytewiseComparator"); + } + } if (table_options_.cache_index_and_filter_blocks && table_options_.no_block_cache) { return Status::InvalidArgument( @@ -594,28 +647,71 @@ Status BlockBasedTableFactory::ValidateOptions( "Enable pin_l0_filter_and_index_blocks_in_cache, " ", but block cache is disabled"); } - if (!IsSupportedFormatVersion(table_options_.format_version)) { + // In TEST mode, also allow writing + // (a) old format_versions that for users are only supported for reads + // (b) future "draft" format versions that are not yet published to users + if (!(IsSupportedFormatVersionForWrite(kBlockBasedTableMagicNumber, + table_options_.format_version) || + (TEST_AllowUnsupportedFormatVersion() && + table_options_.format_version >= + kMinSupportedBbtFormatVersionForRead))) { return Status::InvalidArgument( "Unsupported BlockBasedTable format_version. Please check " "include/rocksdb/table.h for more info"); } - if (table_options_.block_align && (cf_opts.compression != kNoCompression)) { - return Status::InvalidArgument( - "Enable block_align, but compression " - "enabled"); - } - if (table_options_.block_align && - cf_opts.bottommost_compression != kDisableCompressionOption && - cf_opts.bottommost_compression != kNoCompression) { - return Status::InvalidArgument( - "Enable block_align, but bottommost_compression enabled"); + bool using_builtin_compatible_compression = true; + if (cf_opts.compression_manager && + strcmp(cf_opts.compression_manager->CompatibilityName(), + GetBuiltinV2CompressionManager()->CompatibilityName()) != 0) { + if (FormatVersionUsesCompressionManagerName( + table_options_.format_version)) { + using_builtin_compatible_compression = false; + } else { + return Status::InvalidArgument( + "Using a CompressionManager incompatible with built-in (custom " + "CompatibilityName()) is not supported for format_version < 7"); + } } - if (table_options_.block_align) { - for (auto level_compression : cf_opts.compression_per_level) { - if (level_compression != kDisableCompressionOption && - level_compression != kNoCompression) { + auto validate_compression_type_fn = [&](CompressionType ctype, + const char* context) { + if (ctype == kNoCompression) { + return Status::OK(); + } + if (ctype == kDisableCompressionOption) { + if (strcmp(context, "compression") == 0) { return Status::InvalidArgument( - "Enable block_align, but compression_per_level enabled"); + "kDisableCompressionOption not permitted for option: " + "compression"); + } else { + return Status::OK(); + } + } + if (table_options_.block_align) { + return Status::InvalidArgument("Enable block_align, but " + + std::string(context) + " enabled"); + } + if (ctype > kLastBuiltinCompression && + using_builtin_compatible_compression) { + return Status::InvalidArgument( + "Using a CompressionType other than built-in ..."); // TODO + } + // Otherwise + return Status::OK(); + }; + { + Status s = validate_compression_type_fn(cf_opts.compression, "compression"); + if (!s.ok()) { + return s; + } + s = validate_compression_type_fn(cf_opts.bottommost_compression, + "bottommost_compression"); + if (!s.ok()) { + return s; + } + for (auto ctype : cf_opts.compression_per_level) { + s = validate_compression_type_fn(ctype, "compression_per_level"); + if (!s.ok()) { + return s; } } } @@ -628,6 +724,22 @@ Status BlockBasedTableFactory::ValidateOptions( return Status::InvalidArgument( "block size exceeds maximum number (4GiB) allowed"); } + if ((table_options_.super_block_alignment_size & + (table_options_.super_block_alignment_size - 1))) { + return Status::InvalidArgument( + "Super Block alignment requested but super block alignment size is not " + "a power of 2"); + } + if (table_options_.super_block_alignment_size > + std::numeric_limits::max()) { + return Status::InvalidArgument( + "Super block alignment size exceeds maximum number (4GiB) allowed"); + } + if (table_options_.super_block_alignment_space_overhead_ratio > 0 && + table_options_.super_block_alignment_space_overhead_ratio < 4) { + return Status::InvalidArgument( + "Super block alignment space overhead is too high"); + } if (table_options_.data_block_index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash && table_options_.data_block_hash_table_util_ratio <= 0) { @@ -635,6 +747,12 @@ Status BlockBasedTableFactory::ValidateOptions( "data_block_hash_table_util_ratio should be greater than 0 when " "data_block_index_type is set to kDataBlockBinaryAndHash"); } + if (table_options_.user_defined_index_factory && + (cf_opts.compression_opts.parallel_threads > 1 || + cf_opts.bottommost_compression_opts.parallel_threads > 1)) { + return Status::InvalidArgument( + "user_defined_index_factory not supported with parallel compression"); + } if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) { // TODO(myabandeh): support it return Status::InvalidArgument( @@ -806,6 +924,14 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const { ? "nullptr" : table_options_.filter_policy->Name()); ret.append(buffer); + snprintf(buffer, kBufferSize, " user_defined_index_factory: %s\n", + table_options_.user_defined_index_factory == nullptr + ? "nullptr" + : table_options_.user_defined_index_factory->Name()); + ret.append(buffer); + snprintf(buffer, kBufferSize, " fail_if_no_udi_on_open: %d\n", + table_options_.fail_if_no_udi_on_open); + ret.append(buffer); snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n", table_options_.whole_key_filtering); ret.append(buffer); @@ -824,6 +950,15 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const { snprintf(buffer, kBufferSize, " block_align: %d\n", table_options_.block_align); ret.append(buffer); + snprintf(buffer, kBufferSize, + " super_block_alignment_size: %" ROCKSDB_PRIszt "\n", + table_options_.super_block_alignment_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " super_block_alignment_space_overhead_ratio: %" ROCKSDB_PRIszt + "\n", + table_options_.super_block_alignment_space_overhead_ratio); + ret.append(buffer); snprintf(buffer, kBufferSize, " max_auto_readahead_size: %" ROCKSDB_PRIszt "\n", table_options_.max_auto_readahead_size); @@ -950,6 +1085,13 @@ TableFactory* NewBlockBasedTableFactory( return new BlockBasedTableFactory(_table_options); } +Status UserDefinedIndexFactory::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* factory) { + return LoadSharedObject(config_options, value, + factory); +} + const std::string BlockBasedTablePropertyNames::kIndexType = "rocksdb.block.based.table.index.type"; const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering = diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc index 3f55f82a77a5..e0e51469f6f3 100644 --- a/table/block_based/block_based_table_iterator.cc +++ b/table/block_based/block_based_table_iterator.cc @@ -37,6 +37,14 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target, bool async_prefetch) { // TODO(hx235): set `seek_key_prefix_for_readahead_trimming_` // even when `target == nullptr` that is when `SeekToFirst()` is called + if (!multi_scan_status_.ok()) { + return; + } + if (multi_scan_) { + SeekMultiScan(target); + return; + } + if (target != nullptr && prefix_extractor_ && read_options_.prefix_same_as_start) { const Slice& seek_user_key = ExtractUserKey(*target); @@ -56,7 +64,7 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target, ResetBlockCacheLookupVar(); bool autotune_readaheadsize = - is_first_pass && read_options_.auto_readahead_size && + read_options_.auto_readahead_size && (read_options_.iterate_upper_bound || read_options_.prefix_same_as_start); if (autotune_readaheadsize && @@ -181,6 +189,7 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target, } void BlockBasedTableIterator::SeekForPrev(const Slice& target) { + multi_scan_.reset(); direction_ = IterDirection::kBackward; ResetBlockCacheLookupVar(); is_out_of_bound_ = false; @@ -255,6 +264,7 @@ void BlockBasedTableIterator::SeekForPrev(const Slice& target) { } void BlockBasedTableIterator::SeekToLast() { + multi_scan_.reset(); direction_ = IterDirection::kBackward; ResetBlockCacheLookupVar(); is_out_of_bound_ = false; @@ -278,7 +288,9 @@ void BlockBasedTableIterator::SeekToLast() { } void BlockBasedTableIterator::Next() { + assert(Valid()); if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { + assert(!multi_scan_); return; } assert(block_iter_points_to_real_block_); @@ -299,7 +311,9 @@ bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) { } void BlockBasedTableIterator::Prev() { - if (readahead_cache_lookup_ && !IsIndexAtCurr()) { + assert(!multi_scan_); + if ((readahead_cache_lookup_ && !IsIndexAtCurr()) || multi_scan_) { + multi_scan_.reset(); // In case of readahead_cache_lookup_, index_iter_ has moved forward. So we // need to reseek the index_iter_ to point to current block by using // block_iter_'s key. @@ -566,6 +580,10 @@ void BlockBasedTableIterator::FindKeyForward() { } void BlockBasedTableIterator::FindBlockForward() { + if (multi_scan_) { + FindBlockForwardInMultiScan(); + return; + } // TODO the while loop inherits from two-level-iterator. We don't know // whether a block can be empty so it can be replaced by an "if". do { @@ -749,7 +767,7 @@ void BlockBasedTableIterator::InitializeStartAndEndOffsets( // It can be when Reseek is from block cache (which doesn't clear the // buffers in FilePrefetchBuffer but clears block handles from queue) and // reseek also lies within the buffer. So Next will get data from - // exisiting buffers untill this callback is made to prefetch additional + // existing buffers until this callback is made to prefetch additional // data. All handles need to be added to the queue starting from // index_iter_. assert(index_iter_->Valid()); @@ -901,4 +919,505 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize( ResetPreviousBlockOffset(); } +// Note: +// - Iterator should not be reused for multiple multiscans or mixing +// multiscan with regular iterator usage. +// - scan ranges should be non-overlapping, and have increasing start keys. +// If a scan range's limit is not set, then there should only be one scan range. +// - After Prepare(), the iterator expects Seek to be called on the start key +// of each ScanOption in order. If any other Seek is done, an error status is +// returned +// - Whenever all blocks of a scan opt are exhausted, the iterator will become +// invalid and UpperBoundCheckResult() will return kOutOfBound. So that the +// upper layer (LevelIterator) will stop scanning instead thinking EOF is +// reached and continue into the next file. The only exception is for the last +// scan opt. If we reach the end of the last scan opt, UpperBoundCheckResult() +// will return kUnknown instead of kOutOfBound. This mechanism requires that +// scan opts are properly pruned such that there is no scan opt that is after +// this file's key range. +// FIXME: DBIter and MergingIterator may +// internally do Seek() on child iterators, e.g. due to +// ReadOptions::max_skippable_internal_keys or reseeking into range deletion +// end key. These Seeks will be handled properly, as long as the target is +// moving forward. +void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) { + assert(!multi_scan_); + RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_CALLS); + StopWatch sw(table_->get_rep()->ioptions.clock, table_->GetStatistics(), + MULTISCAN_PREPARE_MICROS); + + if (!index_iter_->status().ok()) { + multi_scan_status_ = index_iter_->status(); + RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS); + return; + } + if (multi_scan_) { + multi_scan_.reset(); + multi_scan_status_ = Status::InvalidArgument("Prepare already called"); + RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS); + return; + } + + index_iter_->Prepare(multiscan_opts); + + std::vector scan_block_handles; + std::vector data_block_separators; + std::vector> block_index_ranges_per_scan; + const std::vector& scan_opts = multiscan_opts->GetScanRanges(); + multi_scan_status_ = + CollectBlockHandles(scan_opts, &scan_block_handles, + &block_index_ranges_per_scan, &data_block_separators); + if (!multi_scan_status_.ok()) { + RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS); + return; + } + + // Calculate prefetch_max_idx (enforces max_prefetch_size) + size_t prefetch_max_idx = scan_block_handles.size(); + if (multiscan_opts->max_prefetch_size > 0) { + uint64_t total_size = 0; + for (size_t i = 0; i < scan_block_handles.size(); ++i) { + total_size += + BlockBasedTable::BlockSizeWithTrailer(scan_block_handles[i]); + if (total_size > multiscan_opts->max_prefetch_size) { + prefetch_max_idx = i; + break; + } + } + } + + // Create block handles vector for IODispatcher (limited to prefetch_max_idx) + std::vector blocks_to_prefetch; + if (prefetch_max_idx > 0) { + blocks_to_prefetch.assign(scan_block_handles.begin(), + scan_block_handles.begin() + prefetch_max_idx); + } + + // Submit to IODispatcher + auto job = std::make_shared(); + job->table = const_cast(table_); + job->block_handles = std::move(blocks_to_prefetch); + job->job_options.io_coalesce_threshold = + multiscan_opts->io_coalesce_threshold; + job->job_options.read_options = read_options_; + job->job_options.read_options.async_io = multiscan_opts->use_async_io; + + std::shared_ptr read_set; + // IODispatcher should be provided by DBIter::Prepare() to enable sharing + // across all BlockBasedTableIterators in the scan. Create one if not + // provided (for direct calls to Prepare, e.g., in unit tests). + std::shared_ptr dispatcher = multiscan_opts->io_dispatcher; + if (!dispatcher) { + dispatcher.reset(NewIODispatcher()); + } + multi_scan_status_ = dispatcher->SubmitJob(job, &read_set); + if (!multi_scan_status_.ok()) { + RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS); + return; + } + + // Successful Prepare, init related states so the iterator reads from prepared + // blocks. Note: data_block_separators keeps full size for seek logic. + multi_scan_ = std::make_unique( + table_->get_rep()->ioptions.env->GetFileSystem(), multiscan_opts, + std::move(read_set), std::move(data_block_separators), + std::move(block_index_ranges_per_scan), prefetch_max_idx, + table_->GetStatistics()); + + is_index_at_curr_block_ = false; + block_iter_points_to_real_block_ = false; +} + +void BlockBasedTableIterator::SeekMultiScan(const Slice* seek_target) { + assert(multi_scan_ && multi_scan_status_.ok()); + // This is a MultiScan and Prepare() has been called. + + // Reset out of bound on seek, if it is out of bound again, it will be set + // properly later in the code path + is_out_of_bound_ = false; + + // Validate seek key with scan options + if (!seek_target) { + // start key must be set for multi-scan + multi_scan_status_ = Status::InvalidArgument("No seek key for MultiScan"); + RecordTick(table_->GetStatistics(), MULTISCAN_SEEK_ERRORS); + return; + } + + // Check the case where there is no range prepared on this table + if (multi_scan_->scan_opts->size() == 0) { + // out of bound + MarkPreparedRangeExhausted(); + return; + } + + // Check whether seek key is moving forward. + if (multi_scan_->prev_seek_key_.empty() || + icomp_.Compare(*seek_target, multi_scan_->prev_seek_key_) > 0) { + // If seek key is empty or is larger than previous seek key, update the + // previous seek key. Otherwise use the previous seek key as the adjusted + // seek target moving forward. This prevents seek target going backward, + // which would visit pages that have been unpinned. + // This issue is caused by sub-optimal range delete handling inside merge + // iterator. + // TODO xingbo issues:14068 : Optimize the handling of range delete iterator + // inside merge iterator, so that it doesn't move seek key backward. After + // that we could return error if the key moves backward here. + multi_scan_->prev_seek_key_ = seek_target->ToString(); + } else { + // Seek key is adjusted to previous one, we can return here directly. + return; + } + + // There are 3 different Cases we need to handle: + // The following diagram explain different seek targets seeking at various + // position on the table, while the next_scan_idx points to the PreparedRange + // 2. + // + // next_scan_idx: -------------------┐ + // ▼ + // table: : __[PreparedRange 1]__[PreparedRange 2]__[PreparedRange 3]__ + // Seek target: <----- Case 1 ------>▲<------------- Case 2 --------------> + // │ + // Case 3 + // + // Case 1: seek before the start of next prepared ranges. This could happen + // due to too many delete tomestone triggered reseek or delete range. + // Case 2: seek after the start of next prepared range. + // This could happen due to seek key adjustment from delete range file. + // E.g. LSM has 3 levels, each level has only 1 file: + // L1 : key : 0---10 + // L2 : Delete range key : 0-5 + // L3 : key : 0---10 + // When a range 2-8 was prepared, the prepared key would be 2 on L3 file, + // but the seek key would be 5, as the seek key was updated by the largest + // key of delete range. This causes all of the cases above to be possible, + // when the ranges are adjusted in the above examples. + // Case 3: seek at the beginning of a prepared range (expected case) + + // Allow reseek on the start of the last prepared range due to too many + // tombstone + multi_scan_->next_scan_idx = + std::min(multi_scan_->next_scan_idx, + multi_scan_->block_index_ranges_per_scan.size() - 1); + + auto user_seek_target = ExtractUserKey(*seek_target); + + auto compare_next_scan_start_result = + user_comparator_.CompareWithoutTimestamp( + user_seek_target, /*a_has_ts=*/true, + multi_scan_->scan_opts->GetScanRanges()[multi_scan_->next_scan_idx] + .range.start.value(), + /*b_has_ts=*/false); + + if (compare_next_scan_start_result != 0) { + // The seek target is not exactly same as what was prepared. + if (compare_next_scan_start_result < 0) { + // Case 1: + if (multi_scan_->next_scan_idx == 0) { + // This should not happen, even when seek target is adjusted by delete + // range. The reason is that if the seek target is before the start key + // of the first prepared range, its end key needs to be >= the smallest + // key of this file, otherwise it is skipped in level iterator. If its + // end key is >= the smallest key of this file, then this range will be + // prepared for this file. As delete range could only adjust seek + // target forward, so it would never be before the start key of the + // first prepared range. + assert(false && "Seek target before the first prepared range"); + MarkPreparedRangeExhausted(); + return; + } + auto seek_target_before_previous_prepared_range = + user_comparator_.CompareWithoutTimestamp( + user_seek_target, /*a_has_ts=*/true, + multi_scan_->scan_opts + ->GetScanRanges()[multi_scan_->next_scan_idx - 1] + .range.start.value(), + /*b_has_ts=*/false) < 0; + // Not expected to happen + // This should never happen, the reason is that the + // multi_scan_->next_scan_idx is set to a non zero value is due to a seek + // target larger or equal to the start key of multi_scan_->next_scan_idx-1 + // happened earlier. If a seek happens before the start key of + // multi_scan_->next_scan_idx-1, it would seek a key that is less than + // what was seeked before. + assert(!seek_target_before_previous_prepared_range); + if (seek_target_before_previous_prepared_range) { + multi_scan_status_ = Status::InvalidArgument( + "Seek target is before the previous prepared range at index " + + std::to_string(multi_scan_->next_scan_idx)); + RecordTick(table_->GetStatistics(), MULTISCAN_SEEK_ERRORS); + return; + } + // It should only be possible to seek a key between the start of current + // prepared scan and start of next prepared range. + MultiScanUnexpectedSeekTarget(seek_target, &user_seek_target); + } else { + // Case 2: + MultiScanUnexpectedSeekTarget(seek_target, &user_seek_target); + } + } else { + // Case 2: + assert(multi_scan_->next_scan_idx < + multi_scan_->block_index_ranges_per_scan.size()); + + auto [cur_scan_start_idx, cur_scan_end_idx] = + multi_scan_->block_index_ranges_per_scan[multi_scan_->next_scan_idx]; + // We should have the data block already loaded + ++multi_scan_->next_scan_idx; + if (cur_scan_start_idx >= cur_scan_end_idx) { + // No blocks are prepared for this range at current file. + MarkPreparedRangeExhausted(); + return; + } + + // max_sequential_skip_in_iterations can trigger a reseek on the start + // key of a scan range, even though the multiscan is already past + // `cur_scan_start_idx` (e.g., a user key spans multiple data blocks). + size_t block_idx = + std::max(cur_scan_start_idx, multi_scan_->cur_data_block_idx); + MultiScanSeekTargetFromBlock(seek_target, block_idx); + } +} + +void BlockBasedTableIterator::MultiScanUnexpectedSeekTarget( + const Slice* seek_target, const Slice* user_seek_target) { + // linear search the block that contains the seek target, and unpin blocks + // that are before it. + + // The logic here could be confusing when there is a delete range involved. + // E.g. we have an LSM with 3 levels, each level has only 1 file: + // L1: data file : 0---10 + // L2: Delete range : 0-5 + // L3: data file : 0---10 + // + // MultiScan on ranges 1-2, 3-4, and 5-6. + // When user first do Seek(1), on level 2, due to delete range 0-5, the seek + // key is adjusted to 5 at level 3. Therefore, we will internally do Seek(5) + // and unpins all blocks until 5 at level 3. Then the next scan's blocks from + // 3-4 are unpinned at level 3. It is confusing that maybe block 3-4 should + // not be unpinned, as next scan would need it. But Seek(5) implies that these + // keys are all covered by some range deletion, so the next Seek(3) will also + // do Seek(5) internally, so the blocks from 3-4 could be safely unpinned. + + // advance to the right prepared range + while ( + multi_scan_->next_scan_idx < + multi_scan_->block_index_ranges_per_scan.size() && + (user_comparator_.CompareWithoutTimestamp( + *user_seek_target, /*a_has_ts=*/true, + multi_scan_->scan_opts->GetScanRanges()[multi_scan_->next_scan_idx] + .range.start.value(), + /*b_has_ts=*/false) >= 0)) { + multi_scan_->next_scan_idx++; + } + + // next_scan_idx is guaranteed to be higher than 0. If the seek key is before + // the start key of first prepared range, it is already handled by caller + // SeekMultiScan. It is equal, it would not call this funciton. If it is + // after, next_scan_idx would be advanced by the loop above. + assert(multi_scan_->next_scan_idx > 0); + // Get the current range + auto cur_scan_idx = multi_scan_->next_scan_idx - 1; + auto [cur_scan_start_idx, cur_scan_end_idx] = + multi_scan_->block_index_ranges_per_scan[cur_scan_idx]; + + if (cur_scan_start_idx >= cur_scan_end_idx) { + // No blocks are prepared for this range at current file. + MarkPreparedRangeExhausted(); + return; + } + + // Unpin all the blocks from multi_scan_->cur_data_block_idx to + // cur_scan_start_idx - these are wasted (prefetched but skipped) + for (auto unpin_block_idx = multi_scan_->cur_data_block_idx; + unpin_block_idx < cur_scan_start_idx; unpin_block_idx++) { + // Count as wasted if it was prefetched + if (unpin_block_idx < multi_scan_->prefetch_max_idx) { + multi_scan_->wasted_blocks_count++; + } + multi_scan_->read_set->ReleaseBlock(unpin_block_idx); + } + + // Take the max here to ensure we don't move backwards. + size_t block_idx = + std::max(cur_scan_start_idx, multi_scan_->cur_data_block_idx); + auto const& data_block_separators = multi_scan_->data_block_separators; + while (block_idx < data_block_separators.size() && + (user_comparator_.CompareWithoutTimestamp( + *user_seek_target, /*a_has_ts=*/true, + data_block_separators[block_idx], + /*b_has_ts=*/false) > 0)) { + // Unpin the blocks that are passed - count as wasted if prefetched + if (block_idx < multi_scan_->prefetch_max_idx) { + multi_scan_->wasted_blocks_count++; + } + multi_scan_->read_set->ReleaseBlock(block_idx); + block_idx++; + } + + if (block_idx >= data_block_separators.size()) { + // All of the prepared blocks for this file is exhausted. + MarkPreparedRangeExhausted(); + return; + } + + // The current block may contain the data for the target key + MultiScanSeekTargetFromBlock(seek_target, block_idx); +} + +void BlockBasedTableIterator::MultiScanSeekTargetFromBlock( + const Slice* seek_target, size_t block_idx) { + assert(multi_scan_->cur_data_block_idx <= block_idx); + + if (!block_iter_points_to_real_block_ || + multi_scan_->cur_data_block_idx != block_idx) { + if (block_iter_points_to_real_block_) { + // Should be scan in increasing key range. + // All blocks before cur_data_block_idx_ are not pinned anymore. + assert(multi_scan_->cur_data_block_idx < block_idx); + } + + ResetDataIter(); + + if (MultiScanLoadDataBlock(block_idx)) { + return; + } + } + + // Move current data block index forward until block_idx, meantime, unpin all + // the blocks in between - these are wasted (prefetched but skipped) + while (multi_scan_->cur_data_block_idx < block_idx) { + // Count as wasted if it was prefetched + if (multi_scan_->cur_data_block_idx < multi_scan_->prefetch_max_idx) { + multi_scan_->wasted_blocks_count++; + } + multi_scan_->read_set->ReleaseBlock(multi_scan_->cur_data_block_idx); + multi_scan_->cur_data_block_idx++; + } + block_iter_points_to_real_block_ = true; + block_iter_.Seek(*seek_target); + FindKeyForward(); + CheckOutOfBound(); +} + +void BlockBasedTableIterator::FindBlockForwardInMultiScan() { + assert(multi_scan_); + assert(multi_scan_->next_scan_idx >= 1); + const auto cur_scan_end_idx = std::get<1>( + multi_scan_->block_index_ranges_per_scan[multi_scan_->next_scan_idx - 1]); + do { + if (!block_iter_.status().ok()) { + return; + } + + // If is_out_of_bound_ is true, upper layer (LevelIterator) considers this + // level has reached iterate_upper_bound_ and will not continue to iterate + // into the next file. When we are doing the last scan within a MultiScan + // for this file, it may need to continue to scan into the next file, so + // we do not set is_out_of_bound_ in this case. + if (multi_scan_->cur_data_block_idx + 1 >= cur_scan_end_idx) { + MarkPreparedRangeExhausted(); + return; + } + // Move to the next pinned data block + ResetDataIter(); + // Unpin previous block via ReadSet + multi_scan_->read_set->ReleaseBlock(multi_scan_->cur_data_block_idx); + ++multi_scan_->cur_data_block_idx; + + if (MultiScanLoadDataBlock(multi_scan_->cur_data_block_idx)) { + return; + } + + block_iter_points_to_real_block_ = true; + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); +} + +constexpr auto kVerbose = false; + +Status BlockBasedTableIterator::CollectBlockHandles( + const std::vector& scan_opts, + std::vector* scan_block_handles, + std::vector>* block_index_ranges_per_scan, + std::vector* data_block_separators) { + // print file name and level + if (UNLIKELY(kVerbose)) { + auto file_name = table_->get_rep()->file->file_name(); + auto level = table_->get_rep()->level; + printf("file name : %s, level %d\n", file_name.c_str(), level); + } + for (const auto& scan_opt : scan_opts) { + size_t num_blocks = 0; + bool check_overlap = !scan_block_handles->empty(); + + InternalKey start_key; + const size_t timestamp_size = + user_comparator_.user_comparator()->timestamp_size(); + if (timestamp_size == 0) { + start_key = InternalKey(scan_opt.range.start.value(), kMaxSequenceNumber, + kValueTypeForSeek); + } else { + std::string seek_key; + AppendKeyWithMaxTimestamp(&seek_key, scan_opt.range.start.value(), + timestamp_size); + start_key = InternalKey(seek_key, kMaxSequenceNumber, kValueTypeForSeek); + } + index_iter_->Seek(start_key.Encode()); + while (index_iter_->status().ok() && index_iter_->Valid() && + (!scan_opt.range.limit.has_value() || + user_comparator_.CompareWithoutTimestamp(index_iter_->user_key(), + /*a_has_ts*/ true, + *scan_opt.range.limit, + /*b_has_ts=*/false) < 0)) { + // Only add the block if the index separator is smaller than limit. When + // they are equal or larger, it will be handled later below. + if (check_overlap && + scan_block_handles->back() == index_iter_->value().handle) { + // Skip the current block since it's already in the list + } else { + scan_block_handles->push_back(index_iter_->value().handle); + // clone the Slice to avoid the lifetime issue + data_block_separators->push_back(index_iter_->user_key().ToString()); + } + ++num_blocks; + index_iter_->Next(); + check_overlap = false; + } + + if (!index_iter_->status().ok()) { + // Abort: index iterator error + return index_iter_->status(); + } + + if (index_iter_->Valid()) { + // Handle the last block when its separator is equal or larger than limit + if (check_overlap && + scan_block_handles->back() == index_iter_->value().handle) { + // Skip adding the current block since it's already in the list + } else { + scan_block_handles->push_back(index_iter_->value().handle); + data_block_separators->push_back(index_iter_->user_key().ToString()); + } + ++num_blocks; + } + block_index_ranges_per_scan->emplace_back( + scan_block_handles->size() - num_blocks, scan_block_handles->size()); + if (UNLIKELY(kVerbose)) { + printf("separators :"); + for (const auto& separator : *data_block_separators) { + printf("%s, ", separator.c_str()); + } + printf("\nblock_index_ranges_per_scan :"); + for (auto const& block_index_range : *block_index_ranges_per_scan) { + printf("[%zu, %zu], ", std::get<0>(block_index_range), + std::get<1>(block_index_range)); + } + printf("\n"); + } + } + return Status::OK(); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h index d49224de4ac2..d7c4d409305b 100644 --- a/table/block_based/block_based_table_iterator.h +++ b/table/block_based/block_based_table_iterator.h @@ -10,6 +10,7 @@ #include #include "db/seqno_to_time_mapping.h" +#include "rocksdb/io_dispatcher.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_based_table_reader_impl.h" #include "table/block_based/block_prefetcher.h" @@ -41,11 +42,13 @@ class BlockBasedTableIterator : public InternalIteratorBase { compaction_readahead_size, table_->get_rep()->table_options.initial_auto_readahead_size), allow_unprepared_value_(allow_unprepared_value), - block_iter_points_to_real_block_(false), check_filter_(check_filter), need_upper_bound_check_(need_upper_bound_check), async_read_in_progress_(false), - is_last_level_(table->IsLastLevel()) {} + is_last_level_(table->IsLastLevel()), + block_iter_points_to_real_block_(false) { + multi_scan_status_.PermitUncheckedError(); + } ~BlockBasedTableIterator() override { ClearBlockHandles(); } @@ -57,7 +60,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { - return !is_out_of_bound_ && + return !is_out_of_bound_ && multi_scan_status_.ok() && (is_at_first_key_from_index_ || (block_iter_points_to_real_block_ && block_iter_.Valid())); } @@ -69,6 +72,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { Slice key() const override { assert(Valid()); if (is_at_first_key_from_index_) { + assert(!multi_scan_); return index_iter_->value().first_internal_key; } else { return block_iter_.key(); @@ -135,17 +139,25 @@ class BlockBasedTableIterator : public InternalIteratorBase { return block_iter_.value(); } Status status() const override { + if (!multi_scan_status_.ok()) { + return multi_scan_status_; + } // In case of block cache readahead lookup, it won't add the block to // block_handles if it's index is invalid. So index_iter_->status check can // be skipped. // Prefix index set status to NotFound when the prefix does not exist. if (IsIndexAtCurr() && !index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { + assert(!multi_scan_); return index_iter_->status(); } else if (block_iter_points_to_real_block_) { + // This is the common case. return block_iter_.status(); } else if (async_read_in_progress_) { + assert(!multi_scan_); return Status::TryAgain("Async read in progress"); + } else if (multi_scan_) { + return multi_scan_status_; } else { return Status::OK(); } @@ -157,6 +169,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { } else if (block_upper_bound_check_ == BlockUpperBound::kUpperBoundBeyondCurBlock) { assert(!is_out_of_bound_); + // MultiScan does not do block level upper bound check yet. + assert(!multi_scan_); return IterBoundCheck::kInbound; } else { return IterBoundCheck::kUnknown; @@ -222,12 +236,21 @@ class BlockBasedTableIterator : public InternalIteratorBase { } } + void Prepare(const MultiScanArgs* scan_opts) override; + FilePrefetchBuffer* prefetch_buffer() { return block_prefetcher_.prefetch_buffer(); } std::unique_ptr> index_iter_; + bool TEST_IsBlockPinnedByMultiScan(size_t block_idx) { + if (!multi_scan_ || !multi_scan_->read_set) { + return false; + } + return multi_scan_->read_set->IsBlockAvailable(block_idx); + } + private: enum class IterDirection { kForward, @@ -308,12 +331,20 @@ class BlockBasedTableIterator : public InternalIteratorBase { BlockPrefetcher block_prefetcher_; + // It stores all the block handles that are lookuped in cache ahead when + // BlockCacheLookupForReadAheadSize is called. Since index_iter_ may point to + // different blocks when readahead_size is calculated in + // BlockCacheLookupForReadAheadSize, to avoid index_iter_ reseek, + // block_handles_ is used. + // `block_handles_` is lazily constructed to save CPU when it is unused + std::unique_ptr> block_handles_; + + // The prefix of the key called with SeekImpl(). + // This is for readahead trimming so no data blocks containing keys of a + // different prefix are prefetched + std::string seek_key_prefix_for_readahead_trimming_ = ""; + const bool allow_unprepared_value_; - // True if block_iter_ is initialized and points to the same block - // as index iterator. - bool block_iter_points_to_real_block_; - // See InternalIteratorBase::IsOutOfBound(). - bool is_out_of_bound_ = false; // How current data block's boundary key with the next block is compared with // iterate upper bound. BlockUpperBound block_upper_bound_check_ = BlockUpperBound::kUnknown; @@ -333,18 +364,6 @@ class BlockBasedTableIterator : public InternalIteratorBase { // size based on cache hit and miss. bool readahead_cache_lookup_ = false; - // It stores all the block handles that are lookuped in cache ahead when - // BlockCacheLookupForReadAheadSize is called. Since index_iter_ may point to - // different blocks when readahead_size is calculated in - // BlockCacheLookupForReadAheadSize, to avoid index_iter_ reseek, - // block_handles_ is used. - // `block_handles_` is lazily constructed to save CPU when it is unused - std::unique_ptr> block_handles_; - - // During cache lookup to find readahead size, index_iter_ is iterated and it - // can point to a different block. is_index_at_curr_block_ keeps track of - // that. - bool is_index_at_curr_block_ = true; bool is_index_out_of_bound_ = false; // Used in case of auto_readahead_size to disable the block_cache lookup if @@ -353,10 +372,99 @@ class BlockBasedTableIterator : public InternalIteratorBase { // is used to disable the lookup. IterDirection direction_ = IterDirection::kForward; - // The prefix of the key called with SeekImpl(). - // This is for readahead trimming so no data blocks containing keys of a - // different prefix are prefetched - std::string seek_key_prefix_for_readahead_trimming_ = ""; + //*** BEGIN States used by both regular scan and multiscan + + // True if block_iter_ is initialized and points to the same block + // as index iterator. + bool block_iter_points_to_real_block_; + // See InternalIteratorBase::IsOutOfBound(). + bool is_out_of_bound_ = false; + + // Mark prepared ranges as exhausted for multiscan. + void MarkPreparedRangeExhausted() { + assert(multi_scan_ != nullptr); + if (multi_scan_->next_scan_idx < + multi_scan_->block_index_ranges_per_scan.size()) { + // If there are more prepared ranges, we don't ResetDataIter() here, + // because next scan might be reading from the same block. ResetDataIter() + // will free the underlying block cache handle and we don't want the + // block to be unpinned. + // Set out of bound to mark the current prepared range as exhausted. + is_out_of_bound_ = true; + } else { + // This is the last prepared range of this file, there might be more + // data on next file. Reset data iterator to indicate the iterator is + // no longer valid on this file. Let LevelIter advance to the next file + // instead of ending the scan. + ResetDataIter(); + } + } + + // During cache lookup to find readahead size, index_iter_ is iterated and it + // can point to a different block. + // If Prepare() is called, index_iter_ is used to prefetch data blocks for the + // multiscan, so is_index_at_curr_block_ will be false. + // Whether index is expected to match the current data_block_iter_. + bool is_index_at_curr_block_ = true; + + // *** END States used by both regular scan and multiscan + + // *** BEGIN MultiScan related states *** + struct MultiScanState { + // For Aborting async I/Os in destructor. + const std::shared_ptr fs; + const MultiScanArgs* scan_opts; + // ReadSet owns pinned data blocks and handles async I/O + std::shared_ptr read_set; + // The separator of each data block. + // Its size is same as the number of block handles submitted to + // IODispatcher. The value of separator is larger than or equal to the last + // key in the corresponding data block. + std::vector data_block_separators; + // Track previously seeked key in multi-scan. + // This is used to ensure that the seek key is keep moving forward, as + // blocks that are smaller than the seek key are unpinned from memory. + std::string prev_seek_key_; + + // Indicies into block handles for data blocks for each scan range. + // inclusive start, exclusive end + std::vector> block_index_ranges_per_scan; + size_t next_scan_idx; + size_t cur_data_block_idx; + size_t prefetch_max_idx; + + // For tracking wasted prefetch blocks (prefetched but never read) + Statistics* statistics; + size_t wasted_blocks_count; + + MultiScanState( + const std::shared_ptr& _fs, const MultiScanArgs* _scan_opts, + std::shared_ptr&& _read_set, + std::vector&& _data_block_separators, + std::vector>&& _block_index_ranges_per_scan, + size_t _prefetch_max_idx, Statistics* _statistics) + : fs(_fs), + scan_opts(_scan_opts), + read_set(std::move(_read_set)), + data_block_separators(std::move(_data_block_separators)), + block_index_ranges_per_scan(std::move(_block_index_ranges_per_scan)), + next_scan_idx(0), + cur_data_block_idx(0), + prefetch_max_idx(_prefetch_max_idx), + statistics(_statistics), + wasted_blocks_count(0) {} + + ~MultiScanState() { + if (statistics && wasted_blocks_count > 0) { + RecordTick(statistics, MULTISCAN_PREFETCH_BLOCKS_WASTED, + wasted_blocks_count); + } + } + }; + + Status multi_scan_status_; + std::unique_ptr multi_scan_; + // *** END MultiScan related APIs and states *** void SeekSecondPass(const Slice* target); @@ -472,5 +580,55 @@ class BlockBasedTableIterator : public InternalIteratorBase { uint64_t& end_updated_offset, size_t& prev_handles_size); // *** END APIs relevant to auto tuning of readahead_size *** + + // *** BEGIN APIs relevant to multiscan *** + + void SeekMultiScan(const Slice* target); + + void FindBlockForwardInMultiScan(); + + void MultiScanSeekTargetFromBlock(const Slice* seek_target, size_t block_idx); + void MultiScanUnexpectedSeekTarget(const Slice* seek_target, + const Slice* user_seek_target); + + // Return true, if there is an error, or end of file + bool MultiScanLoadDataBlock(size_t idx) { + if (idx >= multi_scan_->prefetch_max_idx) { + // TODO: Fix the max_prefetch_size support for multiple files. + // The goal is to limit the memory usage, prefetch could be done + // incrementally. + if (multi_scan_->scan_opts->max_prefetch_size == 0) { + // If max_prefetch_size is not set, treat this as end of file. + ResetDataIter(); + assert(!is_out_of_bound_); + assert(!Valid()); + } else { + // If max_prefetch_size is set, treat this as error. + multi_scan_status_ = Status::PrefetchLimitReached(); + } + return true; + } + + // Use ReadSet to get block (handles cache/async/sync transparently) + CachableEntry block_entry; + multi_scan_status_ = multi_scan_->read_set->ReadIndex(idx, &block_entry); + if (!multi_scan_status_.ok()) { + return true; + } + + assert(block_entry.GetValue()); + // Note that the block_iter_ takes ownership of the pinned data block + table_->NewDataBlockIterator(read_options_, block_entry, + &block_iter_, Status::OK()); + return false; + } + + Status CollectBlockHandles( + const std::vector& scan_opts, + std::vector* scan_block_handles, + std::vector>* block_index_ranges_per_scan, + std::vector* data_block_boundary_keys); + + // *** END APIs relevant to multiscan *** }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 103f687f812c..1de0096f4a72 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -46,6 +46,7 @@ #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "rocksdb/trace_record.h" +#include "rocksdb/user_defined_index.h" #include "table/block_based/binary_search_index_reader.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" @@ -58,6 +59,7 @@ #include "table/block_based/hash_index_reader.h" #include "table/block_based/partitioned_filter_block.h" #include "table/block_based/partitioned_index_reader.h" +#include "table/block_based/user_defined_index_wrapper.h" #include "table/block_fetcher.h" #include "table/format.h" #include "table/get_context.h" @@ -91,28 +93,33 @@ CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) { #define INSTANTIATE_BLOCKLIKE_TEMPLATES(T) \ template Status BlockBasedTable::RetrieveBlock( \ FilePrefetchBuffer * prefetch_buffer, const ReadOptions& ro, \ - const BlockHandle& handle, const UncompressionDict& uncompression_dict, \ + const BlockHandle& handle, UnownedPtr decomp, \ CachableEntry* out_parsed_block, GetContext* get_context, \ BlockCacheLookupContext* lookup_context, bool for_compaction, \ bool use_cache, bool async_read, bool use_block_cache_for_lookup) const; \ template Status BlockBasedTable::MaybeReadBlockAndLoadToCache( \ FilePrefetchBuffer * prefetch_buffer, const ReadOptions& ro, \ - const BlockHandle& handle, const UncompressionDict& uncompression_dict, \ + const BlockHandle& handle, UnownedPtr decomp, \ bool for_compaction, CachableEntry* block_entry, \ GetContext* get_context, BlockCacheLookupContext* lookup_context, \ BlockContents* contents, bool async_read, \ bool use_block_cache_for_lookup) const; \ template Status BlockBasedTable::LookupAndPinBlocksInCache( \ const ReadOptions& ro, const BlockHandle& handle, \ + CachableEntry* out_parsed_block) const; \ + template Status BlockBasedTable::CreateAndPinBlockInCache( \ + const ReadOptions& ro, const BlockHandle& handle, \ + UnownedPtr decomp, BlockContents* block_contents, \ CachableEntry* out_parsed_block) const; INSTANTIATE_BLOCKLIKE_TEMPLATES(ParsedFullFilterBlock); -INSTANTIATE_BLOCKLIKE_TEMPLATES(UncompressionDict); +INSTANTIATE_BLOCKLIKE_TEMPLATES(DecompressorDict); INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kData); INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kIndex); INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kFilterPartitionIndex); INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kRangeDeletion); INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kMetaIndex); +INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kUserDefinedIndex); } // namespace ROCKSDB_NAMESPACE @@ -195,7 +202,7 @@ Status ReadAndParseBlockFromFile( const Footer& footer, const ReadOptions& options, const BlockHandle& handle, std::unique_ptr* result, const ImmutableOptions& ioptions, BlockCreateContext& create_context, bool maybe_compressed, - const UncompressionDict& uncompression_dict, + UnownedPtr decomp, const PersistentCacheOptions& cache_options, MemoryAllocator* memory_allocator, bool for_compaction, bool async_read) { assert(result); @@ -204,8 +211,8 @@ Status ReadAndParseBlockFromFile( BlockFetcher block_fetcher( file, prefetch_buffer, footer, options, handle, &contents, ioptions, /*do_uncompress*/ maybe_compressed, maybe_compressed, - TBlocklike::kBlockType, uncompression_dict, cache_options, - memory_allocator, nullptr, for_compaction); + TBlocklike::kBlockType, decomp, cache_options, memory_allocator, nullptr, + for_compaction); Status s; // If prefetch_buffer is not allocated, it will fallback to synchronous // reading of block contents. @@ -562,6 +569,110 @@ Status GetGlobalSequenceNumber(const TableProperties& table_properties, return Status::OK(); } + +Status GetDecompressor(const std::string& compression_name, + UnownedPtr compression_manager, + uint32_t table_format_version, + std::shared_ptr* out_decompressor) { + if (compression_name.empty()) { + // Very old file (before RocksDB 4.9.0) that might contain compressed + // blocks. Get a general decompressor (for all supported format_versions) + auto mgr_to_use = GetBuiltinV2CompressionManager(); + *out_decompressor = mgr_to_use->GetDecompressor(); + return Status::OK(); + } + if (FormatVersionUsesCompressionManagerName(table_format_version)) { + constexpr char kFieldSep = ';'; + size_t separator_pos = compression_name.find_first_of(kFieldSep); + if (separator_pos == std::string::npos) { + return Status::Corruption( + "Missing separator in compression_name property"); + } + // Built with explicit CompressionManager and schema support for + // identifying its compatibility name, which is the first field here. + Slice compatibility_name(compression_name.data(), separator_pos); + std::shared_ptr mgr_to_use; + if (compression_manager) { + // First attempt to go through the compression manager configured for + // writing new files, for efficiency (usually correct) and not forcing + // use of ObjectLibrary registration (dependency injection). + mgr_to_use = compression_manager->FindCompatibleCompressionManager( + compatibility_name); + } + if (mgr_to_use == nullptr) { + ConfigOptions strict; + strict.ignore_unknown_options = false; + strict.ignore_unsupported_options = false; + Status s = CompressionManager::CreateFromString( + strict, compatibility_name.ToString(), &mgr_to_use); + // Even though we might be able to recover from "not found" if only + // built-in compression types are used (would be checked below), it + // would provide misleading or unreliable success to allow that to + // succeed. + if (!s.ok()) { + return s; + } + assert(mgr_to_use || compatibility_name == kNullptrString || + compatibility_name.empty()); + } + + // Second field is set of compression types actually used in the file + size_t start_pos = separator_pos + 1; + separator_pos = compression_name.find_first_of(kFieldSep, start_pos); + if (UNLIKELY(separator_pos == std::string::npos)) { + return Status::Corruption("Missing second field from compression_name"); + } + if (UNLIKELY((separator_pos - start_pos) & 1)) { + return Status::Corruption( + "Second field of compression_name has odd size"); + } + size_t count = (separator_pos - start_pos) / 2; + auto ctypes = std::make_unique(count); + const char* ptr = compression_name.data() + start_pos; + for (size_t i = 0; i < count; ++i) { + uint64_t val = 0; + bool success = ParseBaseChars<16>(&ptr, 2, &val); + if (UNLIKELY(!success || val == kNoCompression || + val >= kDisableCompressionOption)) { + return Status::Corruption( + "Error parsing second field of compression_name"); + } + ctypes[i] = static_cast(val); + } + if (mgr_to_use) { + *out_decompressor = mgr_to_use->GetDecompressorForTypes( + ctypes.get(), ctypes.get() + count); + assert(*out_decompressor || count == 0); + } else { + // Compression/decompression disabled + *out_decompressor = nullptr; + assert(count == 0); + } + // Can ignore possible additional future fields + } else { + // No explicit CompressionManager, e.g. legacy file support where + // decompressing with built-in CompressionManager works. + CompressionType saved_comp_type = + CompressionTypeFromString(compression_name); + if (saved_comp_type == kDisableCompressionOption) { + // Unrecognized. For RocksDB versions able to read format_version=7, + // this is considered an error so that we can continue to evolve the + // schema of the compression_name property and report good error + // messages. + return Status::Corruption("Unrecognized compression_name: " + + compression_name); + } else if (saved_comp_type != kNoCompression) { + // Use built-in compression manager + auto mgr_to_use = GetBuiltinV2CompressionManager(); + *out_decompressor = + mgr_to_use->GetDecompressorOptimizeFor(saved_comp_type); + } else { + // No compression -> decompressor not needed + *out_decompressor = nullptr; + } + } + return Status::OK(); +} } // namespace void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties, @@ -629,6 +740,7 @@ Status BlockBasedTable::Open( std::unique_ptr* table_reader, uint64_t tail_size, std::shared_ptr table_reader_cache_res_mgr, const std::shared_ptr& prefix_extractor, + UnownedPtr compression_manager, const bool prefetch_index_and_filter_in_cache, const bool skip_filters, const int level, const bool immortal_table, const SequenceNumber largest_seqno, const bool force_direct_prefetch, @@ -683,7 +795,8 @@ Status BlockBasedTable::Open( // 6. [meta block: index] // 7. [meta block: filter] IOOptions opts; - s = file->PrepareIOOptions(ro, opts); + IODebugContext dbg; + s = file->PrepareIOOptions(ro, opts, &dbg); if (s.ok()) { s = ReadFooterFromFile(opts, file.get(), *ioptions.fs, prefetch_buffer.get(), file_size, &footer, @@ -695,7 +808,9 @@ Status BlockBasedTable::Open( } return s; } - if (!IsSupportedFormatVersion(footer.format_version())) { + if (!IsSupportedFormatVersionForRead(kBlockBasedTableMagicNumber, + footer.format_version()) && + !TEST_AllowUnsupportedFormatVersion()) { return Status::Corruption( "Unknown Footer version. Maybe this file was created with newer " "version of RocksDB?"); @@ -738,13 +853,19 @@ Status BlockBasedTable::Open( return s; } + // Read compression metadata and configure decompressor + s = GetDecompressor( + rep->table_properties ? rep->table_properties->compression_name + : std::string{}, + compression_manager, footer.format_version(), &rep->decompressor); + if (!s.ok()) { + return s; + } + // Populate BlockCreateContext - bool blocks_definitely_zstd_compressed = - rep->table_properties && (rep->table_properties->compression_name == - CompressionTypeToString(kZSTD)); rep->create_context = BlockCreateContext( &rep->table_options, &rep->ioptions, rep->ioptions.stats, - blocks_definitely_zstd_compressed, block_protection_bytes_per_key, + rep->decompressor.get(), block_protection_bytes_per_key, rep->internal_comparator.user_comparator(), rep->index_value_is_full, rep->index_has_first_key); @@ -806,20 +927,18 @@ Status BlockBasedTable::Open( rep->table_prefix_extractor = prefix_extractor; } else { // Current prefix_extractor doesn't match table - if (rep->table_properties) { - //**TODO: If/When the DBOptions has a registry in it, the ConfigOptions - // will need to use it - ConfigOptions config_options; - Status st = SliceTransform::CreateFromString( - config_options, rep->table_properties->prefix_extractor_name, - &(rep->table_prefix_extractor)); - if (!st.ok()) { - //**TODO: Should this be error be returned or swallowed? - ROCKS_LOG_ERROR(rep->ioptions.logger, - "Failed to create prefix extractor[%s]: %s", - rep->table_properties->prefix_extractor_name.c_str(), - st.ToString().c_str()); - } + //**TODO: If/When the DBOptions has a registry in it, the ConfigOptions + // will need to use it + ConfigOptions config_options; + Status st = SliceTransform::CreateFromString( + config_options, rep->table_properties->prefix_extractor_name, + &(rep->table_prefix_extractor)); + if (!st.ok()) { + //**TODO: Should this be error be returned or swallowed? + ROCKS_LOG_ERROR(rep->ioptions.logger, + "Failed to create prefix extractor[%s]: %s", + rep->table_properties->prefix_extractor_name.c_str(), + st.ToString().c_str()); } } @@ -914,6 +1033,7 @@ Status BlockBasedTable::PrefetchTail( "TailPrefetchStats.", file->file_name().c_str(), tail_prefetch_size); } + TEST_SYNC_POINT("BlockBasedTable::PrefetchTail::TaiSizeNotRecorded"); } size_t prefetch_off; size_t prefetch_len; @@ -933,7 +1053,8 @@ Status BlockBasedTable::PrefetchTail( #endif // NDEBUG IOOptions opts; - Status s = file->PrepareIOOptions(ro, opts); + IODebugContext dbg; + Status s = file->PrepareIOOptions(ro, opts, &dbg); // Try file system prefetch if (s.ok() && !file->use_direct_io() && !force_direct_prefetch) { if (!file->Prefetch(opts, prefetch_off, prefetch_len).IsNotSupported()) { @@ -963,89 +1084,72 @@ Status BlockBasedTable::ReadPropertiesBlock( BlockHandle handle; s = FindOptionalMetaBlock(meta_iter, kPropertiesBlockName, &handle); + if (!s.ok()) { + return s; + } else if (handle.IsNull()) { + return Status::Corruption("Cannot find Properties block from file."); + } + + s = meta_iter->status(); + std::unique_ptr table_properties; + if (s.ok()) { + s = ReadTablePropertiesHelper( + ro, handle, rep_->file.get(), prefetch_buffer, rep_->footer, + rep_->ioptions, &table_properties, nullptr /* memory_allocator */); + } + + if (!s.ok()) { + return s; + } + + assert(table_properties != nullptr); + rep_->table_properties = std::move(table_properties); + + s = rep_->seqno_to_time_mapping.DecodeFrom( + rep_->table_properties->seqno_to_time_mapping); if (!s.ok()) { ROCKS_LOG_WARN(rep_->ioptions.logger, - "Error when seeking to properties block from file: %s", + "Problem reading or processing seqno-to-time mapping: %s", s.ToString().c_str()); - } else if (!handle.IsNull()) { - s = meta_iter->status(); - std::unique_ptr table_properties; - if (s.ok()) { - s = ReadTablePropertiesHelper( - ro, handle, rep_->file.get(), prefetch_buffer, rep_->footer, - rep_->ioptions, &table_properties, nullptr /* memory_allocator */); - } - IGNORE_STATUS_IF_ERROR(s); + } - if (!s.ok()) { - ROCKS_LOG_WARN(rep_->ioptions.logger, - "Encountered error while reading data from properties " - "block %s", - s.ToString().c_str()); - } else { - assert(table_properties != nullptr); - rep_->table_properties = std::move(table_properties); + // Read the table properties + rep_->whole_key_filtering &= IsFeatureSupported( + *(rep_->table_properties), + BlockBasedTablePropertyNames::kWholeKeyFiltering, rep_->ioptions.logger); + rep_->prefix_filtering &= IsFeatureSupported( + *(rep_->table_properties), BlockBasedTablePropertyNames::kPrefixFiltering, + rep_->ioptions.logger); - if (s.ok()) { - s = rep_->seqno_to_time_mapping.DecodeFrom( - rep_->table_properties->seqno_to_time_mapping); - } - if (!s.ok()) { - ROCKS_LOG_WARN( - rep_->ioptions.logger, - "Problem reading or processing seqno-to-time mapping: %s", - s.ToString().c_str()); - } - rep_->blocks_maybe_compressed = - rep_->table_properties->compression_name != - CompressionTypeToString(kNoCompression); - } - } else { - ROCKS_LOG_ERROR(rep_->ioptions.logger, - "Cannot find Properties block from file."); + rep_->index_key_includes_seq = + rep_->table_properties->index_key_is_user_key == 0; + rep_->index_value_is_full = + rep_->table_properties->index_value_is_delta_encoded == 0; + + // Read index_type from properties (required for format_version >= 2) + auto& props = rep_->table_properties->user_collected_properties; + auto index_type_pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (index_type_pos == props.end()) { + return Status::Corruption("Missing index type property"); + } + rep_->index_type = static_cast( + DecodeFixed32(index_type_pos->second.c_str())); + auto min_ts_pos = props.find("rocksdb.timestamp_min"); + if (min_ts_pos != props.end()) { + rep_->min_timestamp = Slice(min_ts_pos->second); + } + auto max_ts_pos = props.find("rocksdb.timestamp_max"); + if (max_ts_pos != props.end()) { + rep_->max_timestamp = Slice(max_ts_pos->second); } - // Read the table properties, if provided. - if (rep_->table_properties) { - rep_->whole_key_filtering &= - IsFeatureSupported(*(rep_->table_properties), - BlockBasedTablePropertyNames::kWholeKeyFiltering, - rep_->ioptions.logger); - rep_->prefix_filtering &= IsFeatureSupported( - *(rep_->table_properties), - BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.logger); - - rep_->index_key_includes_seq = - rep_->table_properties->index_key_is_user_key == 0; - rep_->index_value_is_full = - rep_->table_properties->index_value_is_delta_encoded == 0; - - // Update index_type with the true type. - // If table properties don't contain index type, we assume that the table - // is in very old format and has kBinarySearch index type. - auto& props = rep_->table_properties->user_collected_properties; - auto index_type_pos = props.find(BlockBasedTablePropertyNames::kIndexType); - if (index_type_pos != props.end()) { - rep_->index_type = static_cast( - DecodeFixed32(index_type_pos->second.c_str())); - } - auto min_ts_pos = props.find("rocksdb.timestamp_min"); - if (min_ts_pos != props.end()) { - rep_->min_timestamp = Slice(min_ts_pos->second); - } - auto max_ts_pos = props.find("rocksdb.timestamp_max"); - if (max_ts_pos != props.end()) { - rep_->max_timestamp = Slice(max_ts_pos->second); - } - - rep_->index_has_first_key = - rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey; - - s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, - &(rep_->global_seqno)); - if (!s.ok()) { - ROCKS_LOG_ERROR(rep_->ioptions.logger, "%s", s.ToString().c_str()); - } + rep_->index_has_first_key = + rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey; + + s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, + &(rep_->global_seqno)); + if (!s.ok()) { + ROCKS_LOG_ERROR(rep_->ioptions.logger, "%s", s.ToString().c_str()); } return s; } @@ -1197,13 +1301,75 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( if (!s.ok()) { return s; } + if (table_options.user_defined_index_factory != nullptr) { + std::string udi_name(table_options.user_defined_index_factory->Name()); + BlockHandle udi_block_handle; + + // Should we use FindOptionalMetaBlock here? + s = FindMetaBlock(meta_iter, kUserDefinedIndexPrefix + udi_name, + &udi_block_handle); + if (!s.ok()) { + RecordTick(rep_->ioptions.statistics.get(), + SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT); + if (table_options.fail_if_no_udi_on_open) { + ROCKS_LOG_ERROR(rep_->ioptions.logger, + "Failed to find the the UDI block %s in file %s; %s", + udi_name.c_str(), rep_->file->file_name().c_str(), + s.ToString().c_str()); + // MAke the status more informative + s = Status::Corruption(s.ToString(), rep_->file->file_name()); + return s; + } else { + // Emit a warning, but ignore the error status + ROCKS_LOG_WARN(rep_->ioptions.logger, + "Failed to find the the UDI block %s in file %s; %s", + udi_name.c_str(), rep_->file->file_name().c_str(), + s.ToString().c_str()); + s = Status::OK(); + } + } + + // If the UDI block size is 0, that means there's effectively no user + // defined index. In that case, skip setting up the reader. + if (udi_block_handle.size() > 0) { + // Read the block, and allocate on heap or pin in cache. The UDI block is + // not compressed. RetrieveBlock will verify the checksum. + if (s.ok()) { + s = RetrieveBlock(prefetch_buffer, ro, udi_block_handle, + rep_->decompressor.get(), &rep_->udi_block, + /*get_context=*/nullptr, lookup_context, + /*for_compaction=*/false, use_cache, + /*async_read=*/false, + /*use_block_cache_for_lookup=*/false); + } + if (s.ok()) { + assert(!rep_->udi_block.IsEmpty()); + + std::unique_ptr udi_reader; + UserDefinedIndexOption udi_option; + udi_option.comparator = rep_->internal_comparator.user_comparator(); + s = table_options.user_defined_index_factory->NewReader( + udi_option, rep_->udi_block.GetValue()->data, udi_reader); + if (s.ok()) { + if (udi_reader) { + index_reader = std::make_unique( + udi_name, std::move(index_reader), std::move(udi_reader)); + } else { + s = Status::Corruption("Failed to create UDI reader for " + + udi_name + " in file " + + rep_->file->file_name()); + } + } + } + } + } rep_->index_reader = std::move(index_reader); // The partitions of partitioned index are always stored in cache. They // are hence follow the configuration for pin and prefetch regardless of // the value of cache_index_and_filter_blocks - if (prefetch_all || pin_partition) { + if (s.ok() && (prefetch_all || pin_partition)) { s = rep_->index_reader->CacheDependencies(ro, pin_partition, prefetch_buffer); } @@ -1238,7 +1404,10 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( } } - if (!rep_->compression_dict_handle.IsNull()) { + // NOTE: before the fix to https://github.com/facebook/rocksdb/issues/12409, a + // file could have a (de)compression dictionary block without a configured + // compression, so we need to ignore the dictionary in that case. + if (!rep_->compression_dict_handle.IsNull() && rep_->decompressor) { std::unique_ptr uncompression_dict_reader; s = UncompressionDictReader::Create( this, ro, prefetch_buffer, use_cache, prefetch_all || pin_unpartitioned, @@ -1300,10 +1469,9 @@ Status BlockBasedTable::ReadMetaIndexBlock( Status s = ReadAndParseBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ro, rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions, - rep_->create_context, true /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options, - GetMemoryAllocator(rep_->table_options), false /* for_compaction */, - false /* async_read */); + rep_->create_context, true /*maybe_compressed*/, rep_->decompressor.get(), + rep_->persistent_cache_options, GetMemoryAllocator(rep_->table_options), + false /* for_compaction */, false /* async_read */); if (!s.ok()) { ROCKS_LOG_ERROR(rep_->ioptions.logger, @@ -1342,7 +1510,7 @@ template WithBlocklikeCheck BlockBasedTable::GetDataBlockFromCache( const Slice& cache_key, BlockCacheInterface block_cache, CachableEntry* out_parsed_block, GetContext* get_context, - const UncompressionDict* dict) const { + UnownedPtr decomp) const { assert(out_parsed_block); assert(out_parsed_block->IsEmpty()); @@ -1351,12 +1519,24 @@ WithBlocklikeCheck BlockBasedTable::GetDataBlockFromCache( // Lookup uncompressed cache first if (block_cache) { - BlockCreateContext create_ctx = rep_->create_context; - create_ctx.dict = dict; assert(!cache_key.empty()); - auto cache_handle = block_cache.LookupFull( - cache_key, &create_ctx, GetCachePriority(), statistics, - rep_->ioptions.lowest_used_cache_tier); + typename BlockCacheInterface::TypedHandle* cache_handle; + if (decomp.get() != rep_->decompressor.get() && decomp) { + // `decomp` must be a dictionary-aware decompressor, which is only + // available in the block cache (so that dictionaries can be evicted + // from memory) and can't live in the table reader. + // NOTE: inefficient BlockCreateContext copy for dict-aware decompressor + // (see TODO in block_cache.h) + BlockCreateContext create_ctx = rep_->create_context; + create_ctx.decompressor = decomp.get(); + cache_handle = block_cache.LookupFull( + cache_key, &create_ctx, GetCachePriority(), statistics, + rep_->ioptions.lowest_used_cache_tier); + } else { + cache_handle = block_cache.LookupFull( + cache_key, &rep_->create_context, GetCachePriority(), + statistics, rep_->ioptions.lowest_used_cache_tier); + } // Avoid updating metrics here if the handle is not complete yet. This // happens with MultiGet and secondary cache. So update the metrics only @@ -1386,10 +1566,9 @@ WithBlocklikeCheck BlockBasedTable::PutDataBlockToCache( CachableEntry* out_parsed_block, BlockContents&& uncompressed_block_contents, BlockContents&& compressed_block_contents, CompressionType block_comp_type, - const UncompressionDict& uncompression_dict, - MemoryAllocator* memory_allocator, GetContext* get_context) const { + UnownedPtr decomp, MemoryAllocator* memory_allocator, + GetContext* get_context) const { const ImmutableOptions& ioptions = rep_->ioptions; - const uint32_t format_version = rep_->table_options.format_version; assert(out_parsed_block); assert(out_parsed_block->IsEmpty()); @@ -1401,12 +1580,10 @@ WithBlocklikeCheck BlockBasedTable::PutDataBlockToCache( uncompressed_block_contents.data.empty()) { assert(compressed_block_contents.data.data()); // Retrieve the uncompressed contents into a new buffer - UncompressionContext context(block_comp_type); - UncompressionInfo info(context, uncompression_dict, block_comp_type); - s = UncompressBlockData(info, compressed_block_contents.data.data(), - compressed_block_contents.data.size(), - &uncompressed_block_contents, format_version, - ioptions, memory_allocator); + s = DecompressBlockData( + compressed_block_contents.data.data(), + compressed_block_contents.data.size(), block_comp_type, *decomp, + &uncompressed_block_contents, ioptions, memory_allocator); if (!s.ok()) { return s; } @@ -1505,7 +1682,8 @@ IndexBlockIter* BlockBasedTable::InitBlockIterator( rep->get_global_seqno(block_type), input_iter, rep->ioptions.stats, /* total_order_seek */ true, rep->index_has_first_key, rep->index_key_includes_seq, rep->index_value_is_full, - block_contents_pinned, rep->user_defined_timestamps_persisted); + block_contents_pinned, rep->user_defined_timestamps_persisted, + nullptr /* prefix_index */, rep->table_options.index_block_search_type); } // Right now only called for Data blocks. @@ -1519,15 +1697,18 @@ Status BlockBasedTable::LookupAndPinBlocksInCache( assert(block_cache); Status s; - CachableEntry uncompression_dict; + CachableEntry cached_dict; if (rep_->uncompression_dict_reader) { s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( /* prefetch_buffer= */ nullptr, ro, /* get_context= */ nullptr, /* lookup_context= */ nullptr, - &uncompression_dict); + &cached_dict); if (!s.ok()) { return s; } + if (!cached_dict.GetValue()) { + return Status::Corruption("Success but no dictionary read"); + } } // Do the lookup. @@ -1536,14 +1717,20 @@ Status BlockBasedTable::LookupAndPinBlocksInCache( Statistics* statistics = rep_->ioptions.statistics.get(); - BlockCreateContext create_ctx = rep_->create_context; - create_ctx.dict = uncompression_dict.GetValue() - ? uncompression_dict.GetValue() - : &UncompressionDict::GetEmptyDict(); - - auto cache_handle = - block_cache.LookupFull(key, &create_ctx, GetCachePriority(), - statistics, rep_->ioptions.lowest_used_cache_tier); + typename BlockCacheInterface::TypedHandle* cache_handle; + if (cached_dict.GetValue()) { + // NOTE: inefficient BlockCreateContext copy for dict-aware decompressor + // (see TODO in block_cache.h) + BlockCreateContext create_ctx = rep_->create_context; + create_ctx.decompressor = cached_dict.GetValue()->decompressor_.get(); + cache_handle = block_cache.LookupFull( + key, &create_ctx, GetCachePriority(), statistics, + rep_->ioptions.lowest_used_cache_tier); + } else { + cache_handle = block_cache.LookupFull( + key, &rep_->create_context, GetCachePriority(), statistics, + rep_->ioptions.lowest_used_cache_tier); + } if (!cache_handle) { UpdateCacheMissMetrics(TBlocklike::kBlockType, /* get_context = */ nullptr); @@ -1563,6 +1750,59 @@ Status BlockBasedTable::LookupAndPinBlocksInCache( return s; } +template +Status BlockBasedTable::CreateAndPinBlockInCache( + const ReadOptions& ro, const BlockHandle& handle, + UnownedPtr decomp, BlockContents* contents, + CachableEntry* out_parsed_block) const { + CompressionType compression_type = GetBlockCompressionType(*contents); + // If we don't own the contents and we don't need to decompress, copy + // the block to heap in order to have ownership. If decompression is + // needed, then the decompressor will allocate a buffer. + if (!contents->own_bytes() && compression_type == kNoCompression) { + Slice src = Slice(contents->data.data(), BlockSizeWithTrailer(handle)); + *contents = BlockContents( + CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), src), + handle.size()); +#ifndef NDEBUG + contents->has_trailer = true; +#endif + } + + Status s; + if (ro.fill_cache) { + s = MaybeReadBlockAndLoadToCache(nullptr, ro, handle, decomp, + /*for_compaction=*/false, out_parsed_block, + nullptr, nullptr, contents, + /*async_read=*/false, + /*use_block_cache_for_lookup=*/true); + } + + if (!s.ok()) { + return s; + } + + // fill_cache could be false, or no block cache is configured. In that + // case, decompress if necessary and take ownership of the block + if (out_parsed_block->GetValue() == nullptr && contents != nullptr) { + BlockContents tmp_contents; + if (compression_type != kNoCompression) { + s = DecompressSerializedBlock(contents->data.data(), handle.size(), + compression_type, *decomp, &tmp_contents, + rep_->ioptions, + GetMemoryAllocator(rep_->table_options)); + } else { + tmp_contents = std::move(*contents); + } + if (s.ok()) { + std::unique_ptr block_holder; + rep_->create_context.Create(&block_holder, std::move(tmp_contents)); + out_parsed_block->SetOwnedValue(std::move(block_holder)); + } + } + return s; +} + // If contents is nullptr, this function looks up the block caches for the // data block referenced by handle, and read the block from disk if necessary. // If contents is non-null, it skips the cache lookup and disk read, since @@ -1572,7 +1812,7 @@ template WithBlocklikeCheck BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, - const BlockHandle& handle, const UncompressionDict& uncompression_dict, + const BlockHandle& handle, UnownedPtr decomp, bool for_compaction, CachableEntry* out_parsed_block, GetContext* get_context, BlockCacheLookupContext* lookup_context, BlockContents* contents, bool async_read, @@ -1596,7 +1836,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache( if (!contents) { if (use_block_cache_for_lookup) { s = GetDataBlockFromCache(key, block_cache, out_parsed_block, - get_context, &uncompression_dict); + get_context, decomp); // Value could still be null at this point, so check the cache handle // and update the read pattern for prefetching if (out_parsed_block->GetValue() || @@ -1624,9 +1864,8 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache( ro.fill_cache) { Statistics* statistics = rep_->ioptions.stats; const bool maybe_compressed = - TBlocklike::kBlockType != BlockType::kFilter && - TBlocklike::kBlockType != BlockType::kCompressionDictionary && - rep_->blocks_maybe_compressed; + BlockTypeMaybeCompressed(TBlocklike::kBlockType) && + rep_->decompressor; // This flag, if true, tells BlockFetcher to return the uncompressed // block when ReadBlockContents() is called. const bool do_uncompress = maybe_compressed; @@ -1650,8 +1889,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache( BlockFetcher block_fetcher( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &tmp_contents, rep_->ioptions, do_uncompress, maybe_compressed, - TBlocklike::kBlockType, uncompression_dict, - rep_->persistent_cache_options, + TBlocklike::kBlockType, decomp, rep_->persistent_cache_options, GetMemoryAllocator(rep_->table_options), /*allocator=*/nullptr); @@ -1666,7 +1904,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache( s = block_fetcher.ReadBlockContents(); } - contents_comp_type = block_fetcher.get_compression_type(); + contents_comp_type = block_fetcher.compression_type(); if (get_context) { switch (TBlocklike::kBlockType) { case BlockType::kIndex: @@ -1698,7 +1936,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache( // block in block_fetcher s = PutDataBlockToCache( key, block_cache, out_parsed_block, std::move(uncomp_contents), - std::move(comp_contents), contents_comp_type, uncompression_dict, + std::move(comp_contents), contents_comp_type, decomp, GetMemoryAllocator(rep_->table_options), get_context); } } else { @@ -1714,7 +1952,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache( // the block to the cache. s = PutDataBlockToCache( key, block_cache, out_parsed_block, std::move(uncomp_contents), - std::move(comp_contents), contents_comp_type, uncompression_dict, + std::move(comp_contents), contents_comp_type, decomp, GetMemoryAllocator(rep_->table_options), get_context); } } @@ -1770,6 +2008,7 @@ BlockBasedTable::SaveLookupContextOrTraceRecord( trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; break; case BlockType::kIndex: + case BlockType::kUserDefinedIndex: trace_block_type = TraceType::kBlockTraceIndexBlock; break; default: @@ -1829,7 +2068,7 @@ void BlockBasedTable::FinishTraceRecord( template WithBlocklikeCheck BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, - const BlockHandle& handle, const UncompressionDict& uncompression_dict, + const BlockHandle& handle, UnownedPtr decomp, CachableEntry* out_parsed_block, GetContext* get_context, BlockCacheLookupContext* lookup_context, bool for_compaction, bool use_cache, bool async_read, bool use_block_cache_for_lookup) const { @@ -1839,8 +2078,8 @@ WithBlocklikeCheck BlockBasedTable::RetrieveBlock( Status s; if (use_cache) { s = MaybeReadBlockAndLoadToCache( - prefetch_buffer, ro, handle, uncompression_dict, for_compaction, - out_parsed_block, get_context, lookup_context, + prefetch_buffer, ro, handle, decomp, for_compaction, out_parsed_block, + get_context, lookup_context, /*contents=*/nullptr, async_read, use_block_cache_for_lookup); if (!s.ok()) { @@ -1862,9 +2101,7 @@ WithBlocklikeCheck BlockBasedTable::RetrieveBlock( } const bool maybe_compressed = - TBlocklike::kBlockType != BlockType::kFilter && - TBlocklike::kBlockType != BlockType::kCompressionDictionary && - rep_->blocks_maybe_compressed; + BlockTypeMaybeCompressed(TBlocklike::kBlockType) && rep_->decompressor; std::unique_ptr block; { @@ -1873,9 +2110,9 @@ WithBlocklikeCheck BlockBasedTable::RetrieveBlock( StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, histogram); s = ReadAndParseBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, - rep_->ioptions, rep_->create_context, maybe_compressed, - uncompression_dict, rep_->persistent_cache_options, - GetMemoryAllocator(rep_->table_options), for_compaction, async_read); + rep_->ioptions, rep_->create_context, maybe_compressed, decomp, + rep_->persistent_cache_options, GetMemoryAllocator(rep_->table_options), + for_compaction, async_read); if (get_context) { switch (TBlocklike::kBlockType) { @@ -2445,7 +2682,7 @@ Status BlockBasedTable::Prefetch(const ReadOptions& read_options, } BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; IndexBlockIter iiter_on_stack; - auto iiter = NewIndexIterator(read_options, /*need_upper_bound_check=*/false, + auto iiter = NewIndexIterator(read_options, /*disable_prefix_seek=*/false, &iiter_on_stack, /*get_context=*/nullptr, &lookup_context); std::unique_ptr> iiter_unique_ptr; @@ -2482,7 +2719,7 @@ Status BlockBasedTable::Prefetch(const ReadOptions& read_options, DataBlockIter biter; Status tmp_status; NewDataBlockIterator( - read_options, block_handle, &biter, /*type=*/BlockType::kData, + read_options, block_handle, &biter, /*block_type=*/BlockType::kData, /*get_context=*/nullptr, &lookup_context, /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, /*async_read=*/false, tmp_status, /*use_block_cache_for_lookup=*/true); @@ -2497,7 +2734,8 @@ Status BlockBasedTable::Prefetch(const ReadOptions& read_options, } Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options, - TableReaderCaller caller) { + TableReaderCaller caller, + bool meta_blocks_only) { Status s; // Check Meta blocks std::unique_ptr metaindex; @@ -2512,6 +2750,9 @@ Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options, } else { return s; } + if (meta_blocks_only) { + return s; + } // Check Data blocks IndexBlockIter iiter_on_stack; BlockCacheLookupContext context{caller}; @@ -2557,8 +2798,8 @@ Status BlockBasedTable::VerifyChecksumInBlocks( BlockFetcher block_fetcher( rep_->file.get(), &prefetch_buffer, rep_->footer, read_options, handle, &contents, rep_->ioptions, false /* decompress */, - false /*maybe_compressed*/, BlockType::kData, - UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); + false /*maybe_compressed*/, BlockType::kData, nullptr /*decompressor*/, + rep_->persistent_cache_options); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { break; @@ -2607,6 +2848,10 @@ BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName( return BlockType::kIndex; } + if (meta_block_name.starts_with(kUserDefinedIndexPrefix)) { + return BlockType::kUserDefinedIndex; + } + if (meta_block_name.starts_with(kObsoleteFilterBlockPrefix)) { // Obsolete but possible in old files return BlockType::kInvalid; @@ -2647,12 +2892,12 @@ Status BlockBasedTable::VerifyChecksumInMetaBlocks( // if it was checked on open. } else { // FIXME? Need to verify checksums of index and filter partitions? - s = BlockFetcher( - rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, - read_options, handle, &contents, rep_->ioptions, - false /* decompress */, false /*maybe_compressed*/, - GetBlockTypeForMetaBlockByName(meta_block_name), - UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options) + s = BlockFetcher(rep_->file.get(), nullptr /* prefetch buffer */, + rep_->footer, read_options, handle, &contents, + rep_->ioptions, false /* decompress */, + false /*maybe_compressed*/, + GetBlockTypeForMetaBlockByName(meta_block_name), + nullptr /*decompressor*/, rep_->persistent_cache_options) .ReadBlockContents(); } if (!s.ok()) { @@ -2703,7 +2948,7 @@ bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const { bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { std::unique_ptr> iiter(NewIndexIterator( - options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, + options, /*disable_prefix_seek=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_context=*/nullptr)); iiter->Seek(key); assert(iiter->status().ok()); @@ -2792,12 +3037,7 @@ uint64_t BlockBasedTable::ApproximateDataOffsetOf( } uint64_t BlockBasedTable::GetApproximateDataSize() { - // Should be in table properties unless super old version - if (rep_->table_properties) { - return rep_->table_properties->data_size; - } - // Fall back to rough estimate from footer - return rep_->footer.metaindex_handle().offset(); + return rep_->table_properties->data_size; } uint64_t BlockBasedTable::ApproximateOffsetOf(const ReadOptions& read_options, @@ -2910,9 +3150,9 @@ bool BlockBasedTable::TEST_IndexBlockInCache() const { Status BlockBasedTable::GetKVPairsFromDataBlocks( const ReadOptions& read_options, std::vector* kv_pair_blocks) { std::unique_ptr> blockhandles_iter( - NewIndexIterator(read_options, /*need_upper_bound_check=*/false, + NewIndexIterator(read_options, /*disable_prefix_seek=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, - /*lookup_contex=*/nullptr)); + /*lookup_context=*/nullptr)); Status s = blockhandles_iter->status(); if (!s.ok()) { @@ -2932,7 +3172,7 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( Status tmp_status; datablock_iter.reset(NewDataBlockIterator( read_options, blockhandles_iter->value().handle, - /*input_iter=*/nullptr, /*type=*/BlockType::kData, + /*input_iter=*/nullptr, /*block_type=*/BlockType::kData, /*get_context=*/nullptr, /*lookup_context=*/nullptr, /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, /*async_read=*/false, tmp_status, /*use_block_cache_for_lookup=*/true)); @@ -2964,7 +3204,8 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( return Status::OK(); } -Status BlockBasedTable::DumpTable(WritableFile* out_file) { +Status BlockBasedTable::DumpTable(WritableFile* out_file, + bool show_sequence_number_type) { WritableFileStringStreamAdapter out_file_wrapper(out_file); std::ostream out_stream(&out_file_wrapper); // Output Footer @@ -2972,6 +3213,17 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { "--------------------------------------\n"; out_stream << " " << rep_->footer.ToString() << "\n"; + // Output Checksum Type Legend + out_stream << "Block Checksum Type Legend:\n" + "--------------------------------------\n"; + out_stream << " 0 = kNoChecksum\n"; + out_stream << " 1 = kCRC32c\n"; + out_stream << " 2 = kxxHash\n"; + out_stream << " 3 = kxxHash64\n"; + out_stream << " 4 = kXXH3\n"; + out_stream << " (This file uses checksum type: " + << static_cast(rep_->footer.checksum_type()) << ")\n\n"; + // Output MetaIndex out_stream << "Metaindex Details:\n" "--------------------------------------\n"; @@ -2982,25 +3234,47 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { Status s = ReadMetaIndexBlock(ro, nullptr /* prefetch_buffer */, &metaindex, &metaindex_iter); if (s.ok()) { + // Print metaindex block checksum + DumpBlockChecksumInfo(rep_->footer.metaindex_handle(), ro, + "Metaindex block", out_stream); + for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); metaindex_iter->Next()) { s = metaindex_iter->status(); if (!s.ok()) { return s; } + // Parse block handle from metaindex value + BlockHandle block_handle; + Slice input = metaindex_iter->value(); + Status handle_status = block_handle.DecodeFrom(&input); + + if (!handle_status.ok()) { + out_stream << " Skip the block with type " + << metaindex_iter->key().ToString() + << " due to error: " << handle_status.ToString() << "\n\n"; + continue; + } + if (metaindex_iter->key() == kPropertiesBlockName) { out_stream << " Properties block handle: " << metaindex_iter->value().ToString(true) << "\n"; + DumpBlockChecksumInfo(block_handle, ro, "Properties block", out_stream); } else if (metaindex_iter->key() == kCompressionDictBlockName) { out_stream << " Compression dictionary block handle: " << metaindex_iter->value().ToString(true) << "\n"; + DumpBlockChecksumInfo(block_handle, ro, "Compression dictionary block", + out_stream); } else if (strstr(metaindex_iter->key().ToString().c_str(), "filter.rocksdb.") != nullptr) { out_stream << " Filter block handle: " << metaindex_iter->value().ToString(true) << "\n"; + DumpBlockChecksumInfo(block_handle, ro, "Filter block", out_stream); } else if (metaindex_iter->key() == kRangeDelBlockName) { out_stream << " Range deletion block handle: " << metaindex_iter->value().ToString(true) << "\n"; + DumpBlockChecksumInfo(block_handle, ro, "Range deletion block", + out_stream); } } out_stream << "\n"; @@ -3032,7 +3306,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { // Output compression dictionary if (rep_->uncompression_dict_reader) { - CachableEntry uncompression_dict; + CachableEntry uncompression_dict; s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( nullptr /* prefetch_buffer */, ro, nullptr /* get_context */, nullptr /* lookup_context */, &uncompression_dict); @@ -3057,15 +3331,15 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { out_stream << "Range deletions:\n" "--------------------------------------\n"; for (; range_del_iter->Valid(); range_del_iter->Next()) { - DumpKeyValue(range_del_iter->key(), range_del_iter->value(), - out_stream); + DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_stream, + show_sequence_number_type); } out_stream << "\n"; } delete range_del_iter; } // Output Data blocks - s = DumpDataBlocks(out_stream); + s = DumpDataBlocks(out_stream, show_sequence_number_type); if (!s.ok()) { return s; @@ -3077,15 +3351,65 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { return Status::OK(); } +void BlockBasedTable::DumpBlockChecksumInfo(const BlockHandle& block_handle, + const ReadOptions& read_options, + const char* block_name, + std::ostream& out_stream) const { + if (rep_->footer.GetBlockTrailerSize() == 0) { + return; + } + + size_t block_size = static_cast(block_handle.size()); + size_t block_size_with_trailer = block_size + kBlockTrailerSize; + std::unique_ptr raw_block(new char[block_size_with_trailer]); + Slice raw_block_slice; + IOOptions opts; + IODebugContext dbg; + IOStatus io_s = rep_->file->PrepareIOOptions(read_options, opts, &dbg); + if (io_s.ok()) { + io_s = rep_->file->Read(opts, block_handle.offset(), + block_size_with_trailer, &raw_block_slice, + raw_block.get(), /*aligned_buf=*/nullptr, &dbg); + } + if (io_s.ok() && raw_block_slice.size() == block_size_with_trailer) { + const char* data = raw_block_slice.data(); + uint8_t compression_type_byte = static_cast(data[block_size]); + uint32_t stored_checksum = DecodeFixed32(data + block_size + 1); + uint32_t modifier = ChecksumModifierForContext( + rep_->footer.base_context_checksum(), block_handle.offset()); + uint32_t actual_checksum = stored_checksum - modifier; + out_stream << " " << block_name << " checksum type: " + << static_cast(rep_->footer.checksum_type()) + << " checksum value: 0x" << std::hex << actual_checksum + << std::dec << " offset: " << block_handle.offset() + << " size: " << block_size << " compression type: " + << static_cast(compression_type_byte) << "\n"; + } else { + out_stream << " ERROR: Failed to read " << block_name << " checksum info"; + if (!io_s.ok()) { + out_stream << " - " << io_s.ToString(); + } else if (raw_block_slice.size() != block_size_with_trailer) { + out_stream << " - read " << raw_block_slice.size() << " bytes, expected " + << block_size_with_trailer; + } + out_stream << "\n"; + } +} + Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { out_stream << "Index Details:\n" "--------------------------------------\n"; // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; + + // Print index block checksum information + DumpBlockChecksumInfo(rep_->index_handle, read_options, "Index block", + out_stream); + std::unique_ptr> blockhandles_iter( - NewIndexIterator(read_options, /*need_upper_bound_check=*/false, + NewIndexIterator(read_options, /*disable_prefix_seek=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, - /*lookup_contex=*/nullptr)); + /*lookup_context=*/nullptr)); Status s = blockhandles_iter->status(); if (!s.ok()) { out_stream << "Can not read Index Block \n\n"; @@ -3130,13 +3454,14 @@ Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { return Status::OK(); } -Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { +Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream, + bool show_sequence_number_type) { // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; std::unique_ptr> blockhandles_iter( - NewIndexIterator(read_options, /*need_upper_bound_check=*/false, + NewIndexIterator(read_options, /*disable_prefix_seek=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, - /*lookup_contex=*/nullptr)); + /*lookup_context=*/nullptr)); Status s = blockhandles_iter->status(); if (!s.ok()) { out_stream << "Can not read Index Block \n\n"; @@ -3163,13 +3488,17 @@ Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { out_stream << "Data Block # " << block_id << " @ " << blockhandles_iter->value().handle.ToString(true) << "\n"; + + // Read block checksum information + DumpBlockChecksumInfo(bh, read_options, "Data block", out_stream); + out_stream << "--------------------------------------\n"; std::unique_ptr datablock_iter; Status tmp_status; datablock_iter.reset(NewDataBlockIterator( read_options, blockhandles_iter->value().handle, - /*input_iter=*/nullptr, /*type=*/BlockType::kData, + /*input_iter=*/nullptr, /*block_type=*/BlockType::kData, /*get_context=*/nullptr, /*lookup_context=*/nullptr, /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, /*async_read=*/false, tmp_status, /*use_block_cache_for_lookup=*/true)); @@ -3187,7 +3516,8 @@ Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { out_stream << "Error reading the block - Skipped \n"; break; } - DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_stream); + DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_stream, + show_sequence_number_type); } out_stream << "\n"; } @@ -3209,14 +3539,26 @@ Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { } void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, - std::ostream& out_stream) { - InternalKey ikey; - ikey.DecodeFrom(key); + std::ostream& out_stream, + bool show_sequence_number_type) { + ParsedInternalKey result; + auto s = ParseInternalKey(key, &result, true); + if (!s.ok()) { + out_stream << "Error parsing internal key - Skipped \n"; + return; + } - out_stream << " HEX " << ikey.user_key().ToString(true) << ": " - << value.ToString(true) << "\n"; + if (show_sequence_number_type) { + out_stream << " HEX " << result.user_key.ToString(true) + << " seq: " << result.sequence + << " type: " << std::to_string(result.type) << " : " + << value.ToString(true) << "\n"; + } else { + out_stream << " HEX " << result.user_key.ToString(true) << ": " + << value.ToString(true) << "\n"; + } - std::string str_key = ikey.user_key().ToString(); + std::string str_key = result.user_key.ToString(); std::string str_value = value.ToString(); std::string res_key, res_value; char cspace = ' '; diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 513e517aa85a..4663a83d5721 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -34,6 +34,7 @@ #include "table/two_level_iterator.h" #include "trace_replay/block_cache_tracer.h" #include "util/atomic.h" +#include "util/cast_util.h" #include "util/coro_utils.h" #include "util/hash_containers.h" @@ -105,6 +106,7 @@ class BlockBasedTable : public TableReader { std::shared_ptr table_reader_cache_res_mgr = nullptr, const std::shared_ptr& prefix_extractor = nullptr, + UnownedPtr compression_manager = nullptr, bool prefetch_index_and_filter_in_cache = true, bool skip_filters = false, int level = -1, const bool immortal_table = false, const SequenceNumber largest_seqno = 0, @@ -206,10 +208,12 @@ class BlockBasedTable : public TableReader { size_t ApproximateMemoryUsage() const override; // convert SST file to a human readable form - Status DumpTable(WritableFile* out_file) override; + Status DumpTable(WritableFile* out_file, + bool show_sequence_number_type = false) override; Status VerifyChecksum(const ReadOptions& readOptions, - TableReaderCaller caller) override; + TableReaderCaller caller, + bool meta_blocks_only = false) override; void MarkObsolete(uint32_t uncache_aggressiveness) override; @@ -226,11 +230,15 @@ class BlockBasedTable : public TableReader { // Create an iterator for index access. If iter is null, then a new object // is created on the heap, and the callee will have the ownership. - // If a non-null iter is passed in, it will be used, and the returned value - // is either the same as iter or a new on-heap object that - // wraps the passed iter. In the latter case the return value points - // to a different object then iter, and the callee has the ownership of the - // returned object. + // If a non-null iter is passed in, it may be used, and the returned value + // is either the same as iter or a new on-heap object. + // In the latter case the return value points to a different object then + // iter, and the callee has the ownership of the returned object. + // + // Under all circumstances, the caller MUST use the returned iterator + // for further operations. If the returned iterator != iter, then the + // caller MUST ensure that iter stays in scope until the returned + // iterator is destroyed. virtual InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* iter, GetContext* get_context, @@ -293,11 +301,21 @@ class BlockBasedTable : public TableReader { Status GetKVPairsFromDataBlocks(const ReadOptions& read_options, std::vector* kv_pair_blocks); + // Look up the block cache for the specified block. + // out_parsed_block is set to nullptr if the block is not found in the cache. template Status LookupAndPinBlocksInCache( const ReadOptions& ro, const BlockHandle& handle, CachableEntry* out_parsed_block) const; + // Create the block given in `block_contents` and insert it into block cache. + // `out_parsed_block` points to the inserted block if successful. + template + Status CreateAndPinBlockInCache( + const ReadOptions& ro, const BlockHandle& handle, + UnownedPtr decomp, BlockContents* block_contents, + CachableEntry* out_parsed_block) const; + struct Rep; Rep* get_rep() { return rep_; } @@ -364,7 +382,7 @@ class BlockBasedTable : public TableReader { template WithBlocklikeCheck MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, - const BlockHandle& handle, const UncompressionDict& uncompression_dict, + const BlockHandle& handle, UnownedPtr decomp, bool for_compaction, CachableEntry* block_entry, GetContext* get_context, BlockCacheLookupContext* lookup_context, BlockContents* contents, bool async_read, @@ -376,7 +394,7 @@ class BlockBasedTable : public TableReader { template WithBlocklikeCheck RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, - const BlockHandle& handle, const UncompressionDict& uncompression_dict, + const BlockHandle& handle, UnownedPtr decomp, CachableEntry* block_entry, GetContext* get_context, BlockCacheLookupContext* lookup_context, bool for_compaction, bool use_cache, bool async_read, bool use_block_cache_for_lookup) const; @@ -397,7 +415,7 @@ class BlockBasedTable : public TableReader { const MultiGetRange* batch, const autovector* handles, Status* statuses, CachableEntry* results, char* scratch, - const UncompressionDict& uncompression_dict, bool use_fs_scratch); + UnownedPtr decomp, bool use_fs_scratch); // Get the iterator from the index reader. // @@ -413,7 +431,7 @@ class BlockBasedTable : public TableReader { // 3. We disallowed any io to be performed, that is, read_options == // kBlockCacheTier InternalIteratorBase* NewIndexIterator( - const ReadOptions& read_options, bool need_upper_bound_check, + const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* input_iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) const; @@ -429,7 +447,7 @@ class BlockBasedTable : public TableReader { WithBlocklikeCheck GetDataBlockFromCache( const Slice& cache_key, BlockCacheInterface block_cache, CachableEntry* block, GetContext* get_context, - const UncompressionDict* dict) const; + UnownedPtr decomp) const; // Put a maybe compressed block to the corresponding block caches. // This method will perform decompression against block_contents if needed @@ -447,8 +465,7 @@ class BlockBasedTable : public TableReader { CachableEntry* cached_block, BlockContents&& uncompressed_block_contents, BlockContents&& compressed_block_contents, - CompressionType block_comp_type, - const UncompressionDict& uncompression_dict, + CompressionType block_comp_type, UnownedPtr decomp, MemoryAllocator* memory_allocator, GetContext* get_context) const; // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found @@ -533,9 +550,15 @@ class BlockBasedTable : public TableReader { // Helper functions for DumpTable() Status DumpIndexBlock(std::ostream& out_stream); - Status DumpDataBlocks(std::ostream& out_stream); + Status DumpDataBlocks(std::ostream& out_stream, + bool show_sequence_number_type = false); void DumpKeyValue(const Slice& key, const Slice& value, - std::ostream& out_stream); + std::ostream& out_stream, + bool show_sequence_number_type = false); + void DumpBlockChecksumInfo(const BlockHandle& block_handle, + const ReadOptions& read_options, + const char* block_name, + std::ostream& out_stream) const; // Returns false if prefix_extractor exists and is compatible with that used // in building the table file, otherwise true. @@ -543,6 +566,12 @@ class BlockBasedTable : public TableReader { bool TimestampMayMatch(const ReadOptions& read_options) const; + bool BlockTypeMaybeCompressed(BlockType type) const { + return type != BlockType::kFilter && + type != BlockType::kCompressionDictionary && + type != BlockType::kUserDefinedIndex; + } + // A cumulative data block file read in MultiGet lower than this size will // use a stack buffer static constexpr size_t kMultiGetReadStackBufSize = 8192; @@ -550,6 +579,8 @@ class BlockBasedTable : public TableReader { friend class PartitionedFilterBlockReader; friend class PartitionedFilterBlockTest; friend class DBBasicTest_MultiGetIOBufferOverrun_Test; + friend class ReadSet; + friend class IODispatcherTest; }; // Maintaining state of a two-level iteration on a partitioned index structure. @@ -589,7 +620,9 @@ struct BlockBasedTable::Rep { file_size(_file_size), level(_level), immortal_table(_immortal_table), - user_defined_timestamps_persisted(_user_defined_timestamps_persisted) {} + user_defined_timestamps_persisted(_user_defined_timestamps_persisted), + fs_prefetch_support(CheckFSFeatureSupport( + _ioptions.fs.get(), FSSupportedOps::kFSPrefetch)) {} ~Rep() { status.PermitUncheckedError(); } const ImmutableOptions& ioptions; const EnvOptions& env_options; @@ -650,9 +683,11 @@ struct BlockBasedTable::Rep { Slice min_timestamp; Slice max_timestamp; - // If false, blocks in this file are definitely all uncompressed. Knowing this - // before reading individual blocks enables certain optimizations. - bool blocks_maybe_compressed = true; + // If blocks might be compressed, refers to a decompressor that can decompress + // them. (nullptr -> no blocks compressed) However, if (data) blocks are + // dictionary compressed, a dictionary-aware decompressor is needed, which + // might live in the block cache. + std::shared_ptr decompressor; // These describe how index is encoded. bool index_has_first_key = false; @@ -676,6 +711,8 @@ struct BlockBasedTable::Rep { // `end_key` for range deletion entries. const bool user_defined_timestamps_persisted; + const bool fs_prefetch_support; + // Set to >0 when the file is known to be obsolete and should have its block // cache entries evicted on close. NOTE: when the file becomes obsolete, // there could be multiple table cache references that all mark this file as @@ -686,6 +723,8 @@ struct BlockBasedTable::Rep { std::unique_ptr table_reader_cache_res_handle = nullptr; + CachableEntry udi_block; + SequenceNumber get_global_seqno(BlockType block_type) const { return (block_type == BlockType::kFilterPartitionIndex || block_type == BlockType::kCompressionDictionary) diff --git a/table/block_based/block_based_table_reader_impl.h b/table/block_based/block_based_table_reader_impl.h index fd0db73af1de..288d3035565f 100644 --- a/table/block_based/block_based_table_reader_impl.h +++ b/table/block_based/block_based_table_reader_impl.h @@ -60,34 +60,33 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( } CachableEntry block; - if (rep_->uncompression_dict_reader && block_type == BlockType::kData) { - CachableEntry uncompression_dict; - // For async scans, don't use the prefetch buffer since an async prefetch - // might already be under way and this would invalidate it. Also, the - // uncompression dict is typically at the end of the file and would - // most likely break the sequentiality of the access pattern. - // Same is with auto_readahead_size. It iterates over index to lookup for - // data blocks. And this could break the the sequentiality of the access - // pattern. - s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( - ((ro.async_io || ro.auto_readahead_size) ? nullptr : prefetch_buffer), - ro, get_context, lookup_context, &uncompression_dict); - if (!s.ok()) { - iter->Invalidate(s); - return iter; + { + CachableEntry dict; + Decompressor* decomp = rep_->decompressor.get(); + if (rep_->uncompression_dict_reader && block_type == BlockType::kData) { + // For async scans, don't use the prefetch buffer since an async prefetch + // might already be under way and this would invalidate it. Also, the + // uncompression dict is typically at the end of the file and would + // most likely break the sequentiality of the access pattern. + // Same is with auto_readahead_size. It iterates over index to lookup for + // data blocks. And this could break the the sequentiality of the access + // pattern. + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + ((ro.async_io || ro.auto_readahead_size) ? nullptr : prefetch_buffer), + ro, get_context, lookup_context, &dict); + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + assert(dict.GetValue()); + if (dict.GetValue()) { + decomp = dict.GetValue()->decompressor_.get(); + } } - const UncompressionDict& dict = uncompression_dict.GetValue() - ? *uncompression_dict.GetValue() - : UncompressionDict::GetEmptyDict(); s = RetrieveBlock( - prefetch_buffer, ro, handle, dict, &block.As(), + prefetch_buffer, ro, handle, decomp, &block.As(), get_context, lookup_context, for_compaction, /* use_cache */ true, async_read, use_block_cache_for_lookup); - } else { - s = RetrieveBlock( - prefetch_buffer, ro, handle, UncompressionDict::GetEmptyDict(), - &block.As(), get_context, lookup_context, for_compaction, - /* use_cache */ true, async_read, use_block_cache_for_lookup); } if (s.IsTryAgain() && async_read) { diff --git a/table/block_based/block_based_table_reader_sync_and_async.h b/table/block_based/block_based_table_reader_sync_and_async.h index 7ec152fc8e93..dc9e66214022 100644 --- a/table/block_based/block_based_table_reader_sync_and_async.h +++ b/table/block_based/block_based_table_reader_sync_and_async.h @@ -33,12 +33,10 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) (const ReadOptions& options, const MultiGetRange* batch, const autovector* handles, Status* statuses, CachableEntry* results, char* scratch, - const UncompressionDict& uncompression_dict, bool use_fs_scratch) const { + UnownedPtr decomp, bool use_fs_scratch) const { RandomAccessFileReader* file = rep_->file.get(); const Footer& footer = rep_->footer; const ImmutableOptions& ioptions = rep_->ioptions; - size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit; - MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options); if (ioptions.allow_mmap_reads) { size_t idx_in_batch = 0; @@ -51,7 +49,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) // XXX: use_cache=true means double cache query? statuses[idx_in_batch] = RetrieveBlock( - nullptr, options, handle, uncompression_dict, + nullptr, options, handle, decomp, &results[idx_in_batch].As(), mget_iter->get_context, /* lookup_context */ nullptr, /* for_compaction */ false, /* use_cache */ true, @@ -138,17 +136,18 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) AlignedBuf direct_io_buf; { IOOptions opts; - IOStatus s = file->PrepareIOOptions(options, opts); + IODebugContext dbg; + IOStatus s = file->PrepareIOOptions(options, opts, &dbg); if (s.ok()) { #if defined(WITH_COROUTINES) if (file->use_direct_io()) { #endif // WITH_COROUTINES s = file->MultiRead(opts, &read_reqs[0], read_reqs.size(), - &direct_io_buf); + &direct_io_buf, &dbg); #if defined(WITH_COROUTINES) } else { co_await batch->context()->reader().MultiReadAsync( - file, opts, &read_reqs[0], read_reqs.size(), &direct_io_buf); + file, opts, &read_reqs[0], read_reqs.size(), &direct_io_buf, &dbg); } #endif // WITH_COROUTINES } @@ -221,7 +220,8 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) // in each read request. Checksum is stored in the block trailer, // beyond the payload size. s = VerifyBlockChecksum(footer, data, handle.size(), - rep_->file->file_name(), handle.offset()); + rep_->file->file_name(), handle.offset(), + BlockType::kData); RecordTick(ioptions.stats, BLOCK_CHECKSUM_COMPUTE_COUNT); if (!s.ok()) { RecordTick(ioptions.stats, BLOCK_CHECKSUM_MISMATCH_COUNT); @@ -240,15 +240,17 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) // its not a memory mapped file Slice result; IOOptions opts; - IOStatus io_s = file->PrepareIOOptions(options, opts); + IODebugContext dbg; + IOStatus io_s = file->PrepareIOOptions(options, opts, &dbg); opts.verify_and_reconstruct_read = true; io_s = file->Read(opts, handle.offset(), BlockSizeWithTrailer(handle), - &result, const_cast(data), nullptr); + &result, const_cast(data), nullptr, &dbg); if (io_s.ok()) { assert(result.data() == data); assert(result.size() == BlockSizeWithTrailer(handle)); s = VerifyBlockChecksum(footer, data, handle.size(), - rep_->file->file_name(), handle.offset()); + rep_->file->file_name(), handle.offset(), + BlockType::kData); if (s.ok()) { RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT); @@ -264,81 +266,8 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) } if (s.ok()) { - // When the blocks share the same underlying buffer (scratch or direct io - // buffer), we may need to manually copy the block into heap if the - // serialized block has to be inserted into a cache. That falls into the - // following cases - - // 1. serialized block is not compressed, it needs to be inserted into - // the uncompressed block cache if there is one - // 2. If the serialized block is compressed, it needs to be inserted - // into the compressed block cache if there is one - // - // In all other cases, the serialized block is either uncompressed into a - // heap buffer or there is no cache at all. - CompressionType compression_type = - GetBlockCompressionType(serialized_block); - if ((use_fs_scratch || use_shared_buffer) && - compression_type == kNoCompression) { - Slice serialized = - Slice(req.result.data() + req_offset, BlockSizeWithTrailer(handle)); - serialized_block = BlockContents( - CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), - serialized), - handle.size()); -#ifndef NDEBUG - serialized_block.has_trailer = true; -#endif - } - } - - if (s.ok()) { - if (options.fill_cache) { - CachableEntry* block_entry = &results[idx_in_batch]; - // MaybeReadBlockAndLoadToCache will insert into the block caches if - // necessary. Since we're passing the serialized block contents, it - // will avoid looking up the block cache - s = MaybeReadBlockAndLoadToCache( - nullptr, options, handle, uncompression_dict, - /*for_compaction=*/false, block_entry, mget_iter->get_context, - /*lookup_context=*/nullptr, &serialized_block, - /*async_read=*/false, /*use_block_cache_for_lookup=*/true); - - if (!s.ok()) { - statuses[idx_in_batch] = s; - continue; - } - // block_entry value could be null if no block cache is present, i.e - // BlockBasedTableOptions::no_block_cache is true and no compressed - // block cache is configured. In that case, fall - // through and set up the block explicitly - if (block_entry->GetValue() != nullptr) { - continue; - } - } - - CompressionType compression_type = - GetBlockCompressionType(serialized_block); - BlockContents contents; - if (compression_type != kNoCompression) { - UncompressionContext context(compression_type); - UncompressionInfo info(context, uncompression_dict, compression_type); - s = UncompressSerializedBlock( - info, req.result.data() + req_offset, handle.size(), &contents, - footer.format_version(), rep_->ioptions, memory_allocator); - } else { - // There are two cases here: - // 1) caller uses the shared buffer (scratch or direct io buffer); - // 2) we use the requst buffer. - // If scratch buffer or direct io buffer is used, we ensure that - // all serialized blocks are copyed to the heap as single blocks. If - // scratch buffer is not used, we also have no combined read, so the - // serialized block can be used directly. - contents = std::move(serialized_block); - } - if (s.ok()) { - results[idx_in_batch].SetOwnedValue(std::make_unique( - std::move(contents), read_amp_bytes_per_bit, ioptions.stats)); - } + s = CreateAndPinBlockInCache(options, handle, decomp, &serialized_block, + &results[idx_in_batch]); } statuses[idx_in_batch] = s; } @@ -421,10 +350,10 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) { MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), sst_file_range.end()); - CachableEntry uncompression_dict; - Status uncompression_dict_status; - uncompression_dict_status.PermitUncheckedError(); - bool uncompression_dict_inited = false; + CachableEntry dict; + Status dict_status; + dict_status.PermitUncheckedError(); + bool dict_inited = false; size_t total_len = 0; // GetContext for any key will do, as the stats will be aggregated @@ -466,26 +395,26 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) continue; } - if (!uncompression_dict_inited && rep_->uncompression_dict_reader) { - uncompression_dict_status = - rep_->uncompression_dict_reader - ->GetOrReadUncompressionDictionary( - nullptr /* prefetch_buffer */, read_options, - get_context, &metadata_lookup_context, - &uncompression_dict); - uncompression_dict_inited = true; + if (!dict_inited && rep_->uncompression_dict_reader) { + dict_status = rep_->uncompression_dict_reader + ->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, read_options, + get_context, &metadata_lookup_context, &dict); + dict_inited = true; } - if (!uncompression_dict_status.ok()) { - assert(!uncompression_dict_status.IsNotFound()); - *(miter->s) = uncompression_dict_status; + if (!dict_status.ok()) { + assert(!dict_status.IsNotFound()); + *(miter->s) = dict_status; data_block_range.SkipKey(miter); sst_file_range.SkipKey(miter); continue; + } else { + assert(!dict_inited || dict.GetValue() != nullptr); + } + if (dict.GetValue()) { + create_ctx.decompressor = dict.GetValue()->decompressor_.get(); } - create_ctx.dict = uncompression_dict.GetValue() - ? uncompression_dict.GetValue() - : &UncompressionDict::GetEmptyDict(); if (v.handle.offset() == prev_offset) { // This key can reuse the previous block (later on). @@ -565,11 +494,8 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) if (total_len) { char* scratch = nullptr; bool use_fs_scratch = false; - const UncompressionDict& dict = uncompression_dict.GetValue() - ? *uncompression_dict.GetValue() - : UncompressionDict::GetEmptyDict(); - assert(uncompression_dict_inited || !rep_->uncompression_dict_reader); - assert(uncompression_dict_status.ok()); + assert(dict_inited || !rep_->uncompression_dict_reader); + assert(dict_status.ok()); if (!rep_->file->use_direct_io()) { if (CheckFSFeatureSupport(rep_->ioptions.fs.get(), @@ -589,7 +515,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) // 3. If blocks are compressed and no compressed block cache, use // stack buf if (!use_fs_scratch && !rep_->file->use_direct_io() && - rep_->blocks_maybe_compressed) { + rep_->decompressor) { if (total_len <= kMultiGetReadStackBufSize) { scratch = stack_buf; } else { @@ -599,7 +525,10 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) } CO_AWAIT(RetrieveMultipleBlocks) (read_options, &data_block_range, &block_handles, &statuses[0], - &results[0], scratch, dict, use_fs_scratch); + &results[0], scratch, + dict.GetValue() ? dict.GetValue()->decompressor_.get() + : rep_->decompressor.get(), + use_fs_scratch); if (get_context) { ++(get_context->get_context_stats_.num_sst_read); } diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc index 4a18b6fcda84..7b20759caa54 100644 --- a/table/block_based/block_based_table_reader_test.cc +++ b/table/block_based/block_based_table_reader_test.cc @@ -22,12 +22,16 @@ #include "rocksdb/options.h" #include "table/block_based/block_based_table_builder.h" #include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_iterator.h" #include "table/block_based/partitioned_index_iterator.h" #include "table/format.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/random.h" +// Enable io_uring support for this test +extern "C" bool RocksDbIOUringEnable() { return true; } + namespace ROCKSDB_NAMESPACE { class BlockBasedTableReaderBaseTest : public testing::Test { @@ -49,7 +53,8 @@ class BlockBasedTableReaderBaseTest : public testing::Test { // user defined timestamps and different sequence number to differentiate them static std::vector> GenerateKVMap( int num_block = 2, bool mixed_with_human_readable_string_value = false, - size_t ts_sz = 0, bool same_key_diff_ts = false) { + size_t ts_sz = 0, bool same_key_diff_ts = false, + const Comparator* comparator = BytewiseComparator()) { std::vector> kv; SequenceNumber seq_no = 0; @@ -97,6 +102,10 @@ class BlockBasedTableReaderBaseTest : public testing::Test { } } } + auto comparator_name = std::string(comparator->Name()); + if (comparator_name.find("Reverse") != std::string::npos) { + std::reverse(kv.begin(), kv.end()); + } return kv; } @@ -125,6 +134,7 @@ class BlockBasedTableReaderBaseTest : public testing::Test { InternalKeyComparator comparator(ioptions.user_comparator); ColumnFamilyOptions cf_options; + cf_options.comparator = ioptions.user_comparator; cf_options.prefix_extractor = options_.prefix_extractor; MutableCFOptions moptions(cf_options); CompressionOptions compression_opts; @@ -163,16 +173,18 @@ class BlockBasedTableReaderBaseTest : public testing::Test { bool user_defined_timestamps_persisted = true) { const MutableCFOptions moptions(options_); TableReaderOptions table_reader_options = TableReaderOptions( - ioptions, moptions.prefix_extractor, foptions, comparator, - 0 /* block_protection_bytes_per_key */, false /* _skip_filters */, - false /* _immortal */, false /* _force_direct_prefetch */, - -1 /* _level */, nullptr /* _block_cache_tracer */, + ioptions, moptions.prefix_extractor, moptions.compression_manager.get(), + foptions, comparator, 0 /* block_protection_bytes_per_key */, + false /* _skip_filters */, false /* _immortal */, + false /* _force_direct_prefetch */, -1 /* _level */, + nullptr /* _block_cache_tracer */, 0 /* _max_file_size_for_l0_meta_pin */, "" /* _cur_db_session_id */, - 0 /* _cur_file_num */, {} /* _unique_id */, 0 /* _largest_seqno */, - 0 /* _tail_size */, user_defined_timestamps_persisted); + table_num_++ /* _cur_file_num */, {} /* _unique_id */, + 0 /* _largest_seqno */, 0 /* _tail_size */, + user_defined_timestamps_persisted); std::unique_ptr file; - NewFileReader(table_name, foptions, &file); + NewFileReader(table_name, foptions, &file, ioptions.statistics.get()); uint64_t file_size = 0; ASSERT_OK(env_->GetFileSize(Path(table_name), &file_size)); @@ -190,6 +202,8 @@ class BlockBasedTableReaderBaseTest : public testing::Test { if (status) { *status = s; + } else { + ASSERT_OK(s); } } @@ -199,6 +213,7 @@ class BlockBasedTableReaderBaseTest : public testing::Test { Env* env_; std::shared_ptr fs_; Options options_; + uint64_t table_num_{0}; private: void WriteToFile(const std::string& content, const std::string& filename) { @@ -219,15 +234,82 @@ class BlockBasedTableReaderBaseTest : public testing::Test { } void NewFileReader(const std::string& filename, const FileOptions& opt, - std::unique_ptr* reader) { + std::unique_ptr* reader, + Statistics* stats = nullptr) { std::string path = Path(filename); std::unique_ptr f; ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr)); reader->reset(new RandomAccessFileReader(std::move(f), path, - env_->GetSystemClock().get())); + env_->GetSystemClock().get(), + /*io_tracer=*/nullptr, + /*stats=*/stats)); } }; +struct BlockBasedTableReaderTestParam { + BlockBasedTableReaderTestParam( + CompressionType _compression_type, bool _use_direct_reads, + BlockBasedTableOptions::IndexType _index_type, bool _no_block_cache, + test::UserDefinedTimestampTestMode _udt_test_mode, + uint32_t _compression_parallel_threads, uint32_t _compression_dict_bytes, + bool _same_key_diff_ts, const Comparator* _comparator, bool _fill_cache, + bool _use_async_io, bool _block_align, size_t _super_block_alignment_size, + size_t _super_block_alignment_space_overhead_ratio) + : compression_type(_compression_type), + use_direct_reads(_use_direct_reads), + index_type(_index_type), + no_block_cache(_no_block_cache), + udt_test_mode(_udt_test_mode), + compression_parallel_threads(_compression_parallel_threads), + compression_dict_bytes(_compression_dict_bytes), + same_key_diff_ts(_same_key_diff_ts), + comparator(_comparator), + fill_cache(_fill_cache), + use_async_io(_use_async_io), + block_align(_block_align), + super_block_alignment_size(_super_block_alignment_size), + super_block_alignment_space_overhead_ratio( + _super_block_alignment_space_overhead_ratio) {} + + CompressionType compression_type; + bool use_direct_reads; + BlockBasedTableOptions::IndexType index_type; + bool no_block_cache; + test::UserDefinedTimestampTestMode udt_test_mode; + uint32_t compression_parallel_threads; + uint32_t compression_dict_bytes; + bool same_key_diff_ts; + const Comparator* comparator; + bool fill_cache; + bool use_async_io; + bool block_align; + size_t super_block_alignment_size; + size_t super_block_alignment_space_overhead_ratio; +}; + +// Define operator<< for SpotLockManagerTestParam to stop valgrind from +// complaining uinitialized value when printing SpotLockManagerTestParam. +std::ostream& operator<<(std::ostream& os, + const BlockBasedTableReaderTestParam& param) { + os << "compression_type: " << CompressionTypeToString(param.compression_type) + << " use_direct_reads: " << param.use_direct_reads + << " index_type: " << static_cast(param.index_type) + << " no_block_cache: " << param.no_block_cache + << " udt_test_mode: " << static_cast(param.udt_test_mode) + << " compression_parallel_threads: " << param.compression_parallel_threads + << " compression_dict_bytes: " << param.compression_dict_bytes + << " same_key_diff_ts: " << param.same_key_diff_ts + << " comparator: " << param.comparator->Name() + << " fill_cache: " << param.fill_cache + << " use_async_io: " << param.use_async_io + << " block_align: " << param.block_align + << " super_block_alignment_size: " << param.super_block_alignment_size + << " super_block_alignment_space_overhead_ratio: " + << param.super_block_alignment_space_overhead_ratio; + + return os; +} + // Param 1: compression type // Param 2: whether to use direct reads // Param 3: Block Based Table Index type @@ -244,28 +326,33 @@ class BlockBasedTableReaderBaseTest : public testing::Test { // generate keys with different user provided key, same user-defined // timestamps (if udt enabled), same sequence number. This test mode is // used for testing `Get`, `MultiGet`, and `NewIterator`. +// Param 9: test both the default comparator and a reverse comparator. class BlockBasedTableReaderTest : public BlockBasedTableReaderBaseTest, - public testing::WithParamInterface> { + public testing::WithParamInterface { protected: void SetUp() override { - compression_type_ = std::get<0>(GetParam()); - use_direct_reads_ = std::get<1>(GetParam()); - test::UserDefinedTimestampTestMode udt_test_mode = std::get<4>(GetParam()); + auto param = GetParam(); + compression_type_ = param.compression_type; + use_direct_reads_ = param.use_direct_reads; + test::UserDefinedTimestampTestMode udt_test_mode = param.udt_test_mode; udt_enabled_ = test::IsUDTEnabled(udt_test_mode); persist_udt_ = test::ShouldPersistUDT(udt_test_mode); - compression_parallel_threads_ = std::get<5>(GetParam()); - compression_dict_bytes_ = std::get<6>(GetParam()); - same_key_diff_ts_ = std::get<7>(GetParam()); + compression_parallel_threads_ = param.compression_parallel_threads; + compression_dict_bytes_ = param.compression_dict_bytes; + same_key_diff_ts_ = param.same_key_diff_ts; + comparator_ = param.comparator; BlockBasedTableReaderBaseTest::SetUp(); } void ConfigureTableFactory() override { BlockBasedTableOptions opts; - opts.index_type = std::get<2>(GetParam()); - opts.no_block_cache = std::get<3>(GetParam()); + auto param = GetParam(); + opts.index_type = param.index_type; + opts.no_block_cache = param.no_block_cache; + opts.super_block_alignment_size = param.super_block_alignment_size; + opts.super_block_alignment_space_overhead_ratio = + param.super_block_alignment_space_overhead_ratio; opts.filter_policy.reset(NewBloomFilterPolicy(10, false)); opts.partition_filters = opts.index_type == @@ -284,6 +371,7 @@ class BlockBasedTableReaderTest uint32_t compression_parallel_threads_; uint32_t compression_dict_bytes_; bool same_key_diff_ts_; + const Comparator* comparator_{}; }; class BlockBasedTableReaderGetTest : public BlockBasedTableReaderTest {}; @@ -987,61 +1075,924 @@ TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) { ASSERT_EQ(s.code(), Status::kCorruption); } -// Param 1: compression type -// Param 2: whether to use direct reads -// Param 3: Block Based Table Index type, partitioned filters are also enabled -// when index type is kTwoLevelIndexSearch -// Param 4: BBTO no_block_cache option -// Param 5: test mode for the user-defined timestamp feature -// Param 6: number of parallel compression threads -// Param 7: CompressionOptions.max_dict_bytes and -// CompressionOptions.max_dict_buffer_bytes. This enable/disables -// compression dictionary. -// Param 8: test mode to specify the pattern for generating key / value pairs. +class BlockBasedTableReaderMultiScanTest : public BlockBasedTableReaderTest { + public: + void SetUp() override { + BlockBasedTableReaderTest::SetUp(); + options_.comparator = comparator_; + } +}; + +class BlockBasedTableReaderMultiScanAsyncIOTest + : public BlockBasedTableReaderMultiScanTest {}; + +// TODO: test no block cache case +TEST_P(BlockBasedTableReaderMultiScanAsyncIOTest, MultiScanPrepare) { + auto param = GetParam(); + auto fill_cache = param.fill_cache; + auto use_async_io = param.use_async_io; + + options_.statistics = CreateDBStatistics(); + std::shared_ptr fs = options_.env->GetFileSystem(); + ReadOptions read_opts; + read_opts.fill_cache = fill_cache; + size_t ts_sz = options_.comparator->timestamp_size(); + std::vector> kv = + BlockBasedTableReaderBaseTest::GenerateKVMap( + 100 /* num_block */, + true /* mixed_with_human_readable_string_value */, ts_sz, + same_key_diff_ts_, comparator_); + std::string table_name = "BlockBasedTableReaderTest_NewIterator" + + CompressionTypeToString(compression_type_) + + "_async" + std::to_string(use_async_io); + ImmutableOptions ioptions(options_); + // Only insert 60 out of 100 blocks + CreateTable(table_name, ioptions, compression_type_, + std::vector>{ + kv.begin() + 20 * kEntriesPerBlock, + kv.begin() + 80 * kEntriesPerBlock}, + compression_parallel_threads_, compression_dict_bytes_); + + std::unique_ptr table; + FileOptions foptions; + foptions.use_direct_reads = use_direct_reads_; + InternalKeyComparator comparator(options_.comparator); + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table, + true /* bool prefetch_index_and_filter_in_cache */, + nullptr /* status */, persist_udt_); + + // 1. Should coalesce into a single I/O + std::unique_ptr iter; + iter.reset(table->NewIterator( + read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + MultiScanArgs scan_options(comparator_); + scan_options.use_async_io = use_async_io; + scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first), + ExtractUserKey(kv[31 * kEntriesPerBlock].first)); + scan_options.insert(ExtractUserKey(kv[32 * kEntriesPerBlock].first), + ExtractUserKey(kv[33 * kEntriesPerBlock].first)); + auto read_count_before = + options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT); + + iter->Prepare(&scan_options); + iter->Seek(kv[30 * kEntriesPerBlock].first); + for (size_t i = 30 * kEntriesPerBlock; i <= 31 * kEntriesPerBlock; ++i) { + ASSERT_TRUE(iter->status().ok()) << iter->status().ToString(); + ASSERT_TRUE(iter->Valid()) << i; + ASSERT_EQ(iter->key().ToString(), kv[i].first); + iter->Next(); + } + // Iter may still be valid after scan range. Upper layer (DBIter) handles + // exact upper bound checking. So we don't check !iter->Valid() here. + ASSERT_OK(iter->status()); + iter->Seek(kv[32 * kEntriesPerBlock].first); + for (size_t i = 32 * kEntriesPerBlock; i < 33 * kEntriesPerBlock; ++i) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), kv[i].first); + iter->Next(); + } + ASSERT_OK(iter->status()); + auto read_count_after = + options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT); + ASSERT_EQ(read_count_before + 1, read_count_after); + + // 2. No IO coalesce, should do MultiRead/ReadAsync with 2 read requests. + iter.reset(table->NewIterator( + read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + scan_options = MultiScanArgs(comparator_); + scan_options.insert(ExtractUserKey(kv[40 * kEntriesPerBlock].first), + ExtractUserKey(kv[45 * kEntriesPerBlock].first)); + scan_options.insert(ExtractUserKey(kv[70 * kEntriesPerBlock].first), + ExtractUserKey(kv[75 * kEntriesPerBlock].first)); + + read_count_before = + options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT); + iter->Prepare(&scan_options); + + iter->Seek(kv[40 * kEntriesPerBlock].first); + for (size_t i = 40 * kEntriesPerBlock; i < 45 * kEntriesPerBlock; ++i) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), kv[i].first); + iter->Next(); + } + ASSERT_OK(iter->status()); + iter->Seek(kv[70 * kEntriesPerBlock].first); + for (size_t i = 70 * kEntriesPerBlock; i < 75 * kEntriesPerBlock; ++i) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), kv[i].first); + iter->Next(); + } + ASSERT_OK(iter->status()); + + read_count_after = + options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT); + ASSERT_EQ(read_count_before + 2, read_count_after); + + iter.reset(table->NewIterator( + read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + // 3. Tests I/O excludes blocks already in cache. + // Reading blocks from 40-79 + // From reads above, blocks 40-44 and 70-74 already in cache + // So we should read 45-69, 75-79 in two I/Os. + // If fill_cache is false, then we'll do one giant I/O. + scan_options = MultiScanArgs(comparator_); + scan_options.use_async_io = use_async_io; + scan_options.insert(ExtractUserKey(kv[40 * kEntriesPerBlock].first), + ExtractUserKey(kv[80 * kEntriesPerBlock].first)); + read_count_before = + options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT); + iter->Prepare(&scan_options); + read_count_after = + options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT); + if (!use_async_io) { + if (!fill_cache) { + ASSERT_EQ(read_count_before + 1, read_count_after); + } else { + ASSERT_EQ(read_count_before + 2, read_count_after); + } + } else { + // stat is recorded in async callback which happens in Poll(), and + // Poll() happens during scanning. + ASSERT_EQ(read_count_before, read_count_after); + } + + iter->Seek(kv[40 * kEntriesPerBlock].first); + for (size_t i = 40 * kEntriesPerBlock; i < 80 * kEntriesPerBlock; ++i) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), kv[i].first); + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + read_count_after = + options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT); + if (!fill_cache) { + ASSERT_EQ(read_count_before + 1, read_count_after); + } else { + ASSERT_EQ(read_count_before + 2, read_count_after); + } + + // 4. Check cases when Seek key does not match start key in ScanOptions + iter.reset(table->NewIterator( + read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + scan_options = MultiScanArgs(comparator_); + scan_options.use_async_io = use_async_io; + scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first), + ExtractUserKey(kv[40 * kEntriesPerBlock].first)); + scan_options.insert(ExtractUserKey(kv[50 * kEntriesPerBlock].first), + ExtractUserKey(kv[60 * kEntriesPerBlock].first)); + iter->Prepare(&scan_options); + // Match start key + iter->Seek(kv[30 * kEntriesPerBlock].first); + for (size_t i = 30 * kEntriesPerBlock; i < 40 * kEntriesPerBlock; ++i) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), kv[i].first); + iter->Next(); + } + ASSERT_OK(iter->status()); + + // Seek a key that is larger than next start key is allowed, as long as it is + // larger than the previous key + iter->Seek(kv[50 * kEntriesPerBlock + 1].first); + ASSERT_OK(iter->status()); + + // Check seek key going backward + iter.reset(table->NewIterator( + read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + scan_options = MultiScanArgs(comparator_); + scan_options.use_async_io = use_async_io; + scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first), + ExtractUserKey(kv[31 * kEntriesPerBlock].first)); + scan_options.insert(ExtractUserKey(kv[32 * kEntriesPerBlock].first), + ExtractUserKey(kv[33 * kEntriesPerBlock].first)); + iter->Prepare(&scan_options); + iter->Seek(kv[32 * kEntriesPerBlock].first); + auto key = iter->key(); + ASSERT_OK(iter->status()); + iter->Seek(kv[30 * kEntriesPerBlock].first); + // When seek key goes backward, it is adjusted to the last seeked position. + // Assert the key read is same as before. + ASSERT_EQ(key, iter->key()); + ASSERT_OK(iter->status()); + + // Test prefetch limit reached. + iter.reset(table->NewIterator( + read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + scan_options = MultiScanArgs(comparator_); + scan_options.use_async_io = use_async_io; + scan_options.max_prefetch_size = 1024; // less than block size + scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first), + ExtractUserKey(kv[40 * kEntriesPerBlock].first)); + iter->Prepare(&scan_options); + iter->Seek(kv[31 * kEntriesPerBlock].first); + ASSERT_TRUE(iter->status().IsIncomplete()); + + // Randomly seek keys on the file, as long as the key is moving forward, it + // is allowed + + if (use_async_io) { + // Skip following test when async io is enabled. There is some issue with + // IO_uring that I am still trying to root cause. + // TODO : enable the test again with async IO + return; + } + for (int i = 0; i < 100; i++) { + iter.reset(table->NewIterator( + read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + scan_options = MultiScanArgs(comparator_); + scan_options.use_async_io = use_async_io; + scan_options.insert(ExtractUserKey(kv[5 * kEntriesPerBlock].first), + ExtractUserKey(kv[10 * kEntriesPerBlock].first)); + scan_options.insert(ExtractUserKey(kv[25 * kEntriesPerBlock].first), + ExtractUserKey(kv[35 * kEntriesPerBlock].first)); + scan_options.insert(ExtractUserKey(kv[35 * kEntriesPerBlock].first), + ExtractUserKey(kv[40 * kEntriesPerBlock].first)); + scan_options.insert(ExtractUserKey(kv[45 * kEntriesPerBlock].first), + ExtractUserKey(kv[50 * kEntriesPerBlock].first)); + scan_options.insert(ExtractUserKey(kv[75 * kEntriesPerBlock].first), + ExtractUserKey(kv[85 * kEntriesPerBlock].first)); + scan_options.insert(ExtractUserKey(kv[85 * kEntriesPerBlock].first), + ExtractUserKey(kv[95 * kEntriesPerBlock].first)); + + iter->Prepare(&scan_options); + + auto random_seed = static_cast( + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count()); + Random rnd(random_seed); + std::cout << random_seed << std::endl; + SCOPED_TRACE("Random seed " + std::to_string(random_seed)); + + // Search key always start from the start key of first prepared range. + int last_read_key_index = rnd.Uniform(100) + 5 * kEntriesPerBlock; + while (last_read_key_index < 100 * kEntriesPerBlock) { + iter->Seek(kv[last_read_key_index].first); + EXPECT_OK(iter->status()); + // iterate for a few keys + while (iter->Valid()) { + iter->Next(); + last_read_key_index++; + EXPECT_OK(iter->status()); + } + last_read_key_index += rnd.Uniform(100); + } + } +} + +TEST_P(BlockBasedTableReaderMultiScanTest, MultiScanPrefetchSizeLimit) { + if (compression_type_ != kNoCompression) { + // This test relies on block sizes to be close to what's set in option. + ROCKSDB_GTEST_BYPASS("This test assumes no compression."); + return; + } + ReadOptions read_opts; + size_t ts_sz = options_.comparator->timestamp_size(); + + // Generate data that spans multiple blocks + std::vector> kv = + BlockBasedTableReaderBaseTest::GenerateKVMap( + 20 /* num_block */, true /* mixed_with_human_readable_string_value */, + ts_sz, same_key_diff_ts_, comparator_); + + std::string table_name = "BlockBasedTableReaderTest_PrefetchSizeLimit" + + CompressionTypeToString(compression_type_); + + ImmutableOptions ioptions(options_); + CreateTable(table_name, ioptions, compression_type_, kv, + compression_parallel_threads_, compression_dict_bytes_); + + std::unique_ptr table; + FileOptions foptions; + foptions.use_direct_reads = use_direct_reads_; + InternalKeyComparator comparator(options_.comparator); + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table, + true /* bool prefetch_index_and_filter_in_cache */, + nullptr /* status */, persist_udt_); + + // Default block size is 4KB + // + // Tests when no block is loaded + { + std::unique_ptr iter; + iter.reset(table->NewIterator( + read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + MultiScanArgs scan_options(comparator_); + scan_options.max_prefetch_size = 1024; // less than block size + scan_options.insert(ExtractUserKey(kv[0].first), + ExtractUserKey(kv[5].first)); + + iter->Prepare(&scan_options); + + // Should be able to scan the first block, but not more + iter->Seek(kv[0].first); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsPrefetchLimitReached()); + } + + // Some blocks are loaded + { + std::unique_ptr iter; + iter.reset(table->NewIterator( + read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + MultiScanArgs scan_options(comparator_); + scan_options.max_prefetch_size = 9 * 1024; // 9KB - 2 blocks with buffer + scan_options.insert(ExtractUserKey(kv[1 * kEntriesPerBlock].first), + ExtractUserKey(kv[8 * kEntriesPerBlock].first)); + + iter->Prepare(&scan_options); + iter->Seek(kv[1 * kEntriesPerBlock].first); + size_t scanned_keys = 0; + + // Should be able to scan up to 2 blocks worth of data + while (iter->Valid()) { + ASSERT_EQ(iter->key().ToString(), + kv[scanned_keys + 1 * kEntriesPerBlock].first); + iter->Next(); + scanned_keys++; + } + + ASSERT_TRUE(iter->status().IsPrefetchLimitReached()); + ASSERT_EQ(scanned_keys, 2 * kEntriesPerBlock); + } + + // Tests with some block loaded in cache already: + // Blocks 1 and 2 are already in cache by the above test. + // Here we try blocks 0 - 5, with prefetch limit to 3 blocks, and expect to + // read 3 blocks. + { + std::unique_ptr iter; + iter.reset(table->NewIterator( + read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + MultiScanArgs scan_options(comparator_); + scan_options.max_prefetch_size = 3 * 4 * 1024 + 1024; // 3 blocks + 1KB + scan_options.insert(ExtractUserKey(kv[0].first), + ExtractUserKey(kv[5 * kEntriesPerBlock].first)); + + iter->Prepare(&scan_options); + iter->Seek(kv[0].first); + size_t scanned_keys = 0; + // Should only read 3 blocks (blocks 0, 1, 2) + // already cached. + while (iter->Valid()) { + ASSERT_EQ(iter->key().ToString(), kv[scanned_keys].first); + iter->Next(); + scanned_keys++; + } + ASSERT_TRUE(iter->status().IsPrefetchLimitReached()); + ASSERT_EQ(scanned_keys, 3 * kEntriesPerBlock); + } + + // Multiple scan ranges with prefetch limit + { + std::unique_ptr iter; + iter.reset(table->NewIterator( + read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + MultiScanArgs scan_options(comparator_); + scan_options.max_prefetch_size = 5 * 4 * 1024 + 1024; // 5 blocks + 1KB + // Will read 5 entries from first scan range, and 4 blocks from the second + // scan range + scan_options.insert(ExtractUserKey(kv[0].first), + ExtractUserKey(kv[5].first)); + scan_options.insert(ExtractUserKey(kv[12 * kEntriesPerBlock].first), + ExtractUserKey(kv[17 * kEntriesPerBlock].first)); + scan_options.insert(ExtractUserKey(kv[18 * kEntriesPerBlock].first), + ExtractUserKey(kv[19 * kEntriesPerBlock].first)); + + iter->Prepare(&scan_options); + + iter->Seek(kv[0].first); + size_t scanned_keys = 0; + size_t key_idx = 0; + while (iter->Valid()) { + ASSERT_EQ(iter->key().ToString(), kv[key_idx].first); + iter->Next(); + scanned_keys++; + key_idx++; + if (key_idx == 5) { + iter->Seek(kv[12 * kEntriesPerBlock].first); + key_idx = 12 * kEntriesPerBlock; + } + } + ASSERT_EQ(scanned_keys, 5 + 4 * kEntriesPerBlock); + ASSERT_TRUE(iter->status().IsPrefetchLimitReached()); + } + + // Prefetch limit is big enough for all scan ranges. + { + std::unique_ptr iter; + iter.reset(table->NewIterator( + read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + MultiScanArgs scan_options(comparator_); + scan_options.max_prefetch_size = 10 * 1024 * 1024; // 10MB + scan_options.insert(ExtractUserKey(kv[0].first), + ExtractUserKey(kv[5].first)); + scan_options.insert(ExtractUserKey(kv[8 * kEntriesPerBlock].first), + ExtractUserKey(kv[12 * kEntriesPerBlock].first)); + scan_options.insert(ExtractUserKey(kv[18 * kEntriesPerBlock].first), + ExtractUserKey(kv[19 * kEntriesPerBlock].first)); + + iter->Prepare(&scan_options); + + iter->Seek(kv[0].first); + size_t scanned_keys = 0; + size_t key_idx = 0; + // Scan first range + while (iter->Valid() && key_idx < 5) { + ASSERT_EQ(iter->key().ToString(), kv[key_idx].first); + iter->Next(); + scanned_keys++; + key_idx++; + } + // Move to second range + iter->Seek(kv[8 * kEntriesPerBlock].first); + key_idx = 8 * kEntriesPerBlock; + while (iter->Valid() && key_idx < 12 * kEntriesPerBlock) { + ASSERT_EQ(iter->key().ToString(), kv[key_idx].first); + iter->Next(); + scanned_keys++; + key_idx++; + } + // Move to third range + iter->Seek(kv[18 * kEntriesPerBlock].first); + key_idx = 18 * kEntriesPerBlock; + while (iter->Valid() && key_idx < 19 * kEntriesPerBlock) { + ASSERT_EQ(iter->key().ToString(), kv[key_idx].first); + iter->Next(); + scanned_keys++; + key_idx++; + } + // Should not hit prefetch limit + ASSERT_OK(iter->status()); + ASSERT_EQ(scanned_keys, 5 + 4 * kEntriesPerBlock + 1 * kEntriesPerBlock); + } +} + +TEST_P(BlockBasedTableReaderMultiScanTest, MultiScanUnpinPreviousBlocks) { + std::vector> kv = + BlockBasedTableReaderBaseTest::GenerateKVMap( + 30 /* num_block */, true /* mixed_with_human_readable_string_value */, + comparator_->timestamp_size(), same_key_diff_ts_, comparator_); + std::string table_name = "BlockBasedTableReaderTest_UnpinPreviousBlocks" + + CompressionTypeToString(compression_type_); + ImmutableOptions ioptions(options_); + CreateTable(table_name, ioptions, compression_type_, kv, + compression_parallel_threads_, compression_dict_bytes_); + + std::unique_ptr table; + FileOptions foptions; + foptions.use_direct_reads = use_direct_reads_; + InternalKeyComparator comparator(options_.comparator); + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table, + true /* bool prefetch_index_and_filter_in_cache */, + nullptr /* status */, persist_udt_); + + ReadOptions read_opts; + std::unique_ptr iter; + iter.reset(table->NewIterator( + read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + MultiScanArgs scan_options(BytewiseComparator()); + // Range 1: block 0-4, Range 2: block 4-4, Range 3: block 5-15 + scan_options.insert(ExtractUserKey(kv[0 * kEntriesPerBlock].first), + ExtractUserKey(kv[5 * kEntriesPerBlock - 5].first)); + scan_options.insert(ExtractUserKey(kv[5 * kEntriesPerBlock - 4].first), + ExtractUserKey(kv[5 * kEntriesPerBlock - 3].first)); + scan_options.insert(ExtractUserKey(kv[5 * kEntriesPerBlock - 2].first), + ExtractUserKey(kv[15 * kEntriesPerBlock - 1].first)); + + iter->Prepare(&scan_options); + auto* bbiter = dynamic_cast(iter.get()); + ASSERT_TRUE(bbiter); + for (int block = 0; block < 15; ++block) { + ASSERT_TRUE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block; + } + + // MultiScan require seeks to be called in scan_option order + iter->Seek(kv[0 * kEntriesPerBlock].first); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + + // Seek to second range - should unpin blocks from first range + iter->Seek(kv[5 * kEntriesPerBlock - 4].first); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), kv[5 * kEntriesPerBlock - 4].first); + ASSERT_EQ(iter->value(), kv[5 * kEntriesPerBlock - 4].second); + + // The last block (block 4) is shared with the second range, so + // it's not unpinned yet. + for (int block = 0; block < 4; ++block) { + ASSERT_FALSE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block; + } + // Blocks from second range still in cache. + // We skip block 4 here since it's ownership is moved to the actual data + // block iter. + for (int block = 5; block < 15; ++block) { + ASSERT_TRUE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block; + } + + iter->Seek(kv[5 * kEntriesPerBlock - 2].first); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), kv[5 * kEntriesPerBlock - 2].first); + ASSERT_EQ(iter->value(), kv[5 * kEntriesPerBlock - 2].second); + + // Still pinned + for (int block = 5; block < 15; ++block) { + ASSERT_TRUE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block; + } +} + +// Test that fs_prefetch_support flag is correctly initialized during table +// construction based on filesystem capabilities +TEST_P(BlockBasedTableReaderTest, FSPrefetchSupportInitializedCorrectly) { + class ConfigurablePrefetchFS : public FileSystemWrapper { + public: + ConfigurablePrefetchFS(const std::shared_ptr& target, + bool support_prefetch) + : FileSystemWrapper(target), support_prefetch_(support_prefetch) {} + + static const char* kClassName() { return "ConfigurablePrefetchFS"; } + const char* Name() const override { return kClassName(); } + + void SupportedOps(int64_t& supported_ops) override { + target()->SupportedOps(supported_ops); + if (!support_prefetch_) { // Disable prefetch support if requested + supported_ops &= ~(1 << FSSupportedOps::kFSPrefetch); + } + } + + private: + bool support_prefetch_; + }; + + // Prepare test table + Options options; + options.persist_user_defined_timestamps = persist_udt_; + if (udt_enabled_) { + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + } + size_t ts_sz = options.comparator->timestamp_size(); + auto kv = BlockBasedTableReaderBaseTest::GenerateKVMap(5, true, ts_sz); + std::string table_name = "BlockBasedTableReaderTest_BlockPrefetcherTest" + + CompressionTypeToString(compression_type_); + ImmutableOptions ioptions(options); + CreateTable(table_name, ioptions, compression_type_, kv, + compression_parallel_threads_, compression_dict_bytes_); + + // Test Case 1: Filesystem supports prefetch, fs_prefetch_support should be + // true + { + auto fs_with_prefetch = std::make_shared( + env_->GetFileSystem(), true /* support_prefetch */); + std::unique_ptr env_wrapper( + new CompositeEnvWrapper(env_, fs_with_prefetch)); + options.env = env_wrapper.get(); + + FileOptions fopts; + fopts.use_direct_reads = use_direct_reads_; + InternalKeyComparator cmp(options.comparator); + ImmutableOptions iopts(options); + + std::unique_ptr table; + NewBlockBasedTableReader(fopts, iopts, cmp, table_name, &table, + false /* prefetch_index_and_filter_in_cache */, + nullptr, persist_udt_); + + ASSERT_TRUE(table->get_rep()->fs_prefetch_support); + ASSERT_TRUE(CheckFSFeatureSupport(fs_with_prefetch.get(), + FSSupportedOps::kFSPrefetch)); + } + + // Test Case 2: Filesystem doesn't support prefetch, fs_prefetch_support + // should be false + { + auto fs_without_prefetch = std::make_shared( + env_->GetFileSystem(), false /* support_prefetch */); + std::unique_ptr env_wrapper( + new CompositeEnvWrapper(env_, fs_without_prefetch)); + options.env = env_wrapper.get(); + + FileOptions fopts; + fopts.use_direct_reads = use_direct_reads_; + InternalKeyComparator cmp(options.comparator); + ImmutableOptions iopts(options); + + std::unique_ptr table; + NewBlockBasedTableReader(fopts, iopts, cmp, table_name, &table, + false /* prefetch_index_and_filter_in_cache */, + nullptr, persist_udt_); + + ASSERT_FALSE(table->get_rep()->fs_prefetch_support); + ASSERT_FALSE(CheckFSFeatureSupport(fs_without_prefetch.get(), + FSSupportedOps::kFSPrefetch)); + } +} +std::vector GenerateCombinedParameters( + const std::vector& compression_types, + const std::vector& use_direct_read_flags, + const std::vector& index_types, + const std::vector& no_block_cache_flags, + const std::vector& udt_test_modes, + const std::vector& parallel_compression_thread_counts, + const std::vector& compression_dict_byte_counts, + const std::vector& same_key_diff_ts_flags, + const std::vector& comparators, + const std::vector& fill_cache_flags, + const std::vector& use_async_io_flags, + const std::vector& block_align_flags, + const std::vector& super_block_alignment_sizes, + const std::vector& super_block_alignment_space_overhead_ratios) { + std::vector params; + for (const auto& compression_type : compression_types) { + for (auto use_direct_read : use_direct_read_flags) { + for (const auto& index_type : index_types) { + for (auto no_block_cache : no_block_cache_flags) { + for (const auto& udt_test_mode : udt_test_modes) { + for (auto parallel_compression_thread_count : + parallel_compression_thread_counts) { + for (auto compression_dict_byte_count : + compression_dict_byte_counts) { + for (auto same_key_diff_ts_flag : same_key_diff_ts_flags) { + for (const auto& comparator : comparators) { + for (auto fill_cache : fill_cache_flags) { + for (auto use_async_io : use_async_io_flags) { + for (auto block_align : block_align_flags) { + for (auto super_block_alignment_size : + super_block_alignment_sizes) { + for ( + auto + super_block_alignment_space_overhead_ratio : + super_block_alignment_space_overhead_ratios) { + if (super_block_alignment_size == 0) { + // Override padding size to 0 if alignment size + // is 0, which means no super block alignment + super_block_alignment_space_overhead_ratio = 0; + } + params.emplace_back( + compression_type, use_direct_read, index_type, + no_block_cache, udt_test_mode, + parallel_compression_thread_count, + compression_dict_byte_count, + same_key_diff_ts_flag, comparator, fill_cache, + use_async_io, block_align, + super_block_alignment_size, + super_block_alignment_space_overhead_ratio); + } + } + } + } + } + } + } + } + } + } + } + } + } + } + return params; +} + +std::vector Bool() { return {true, false}; } + +struct BlockBasedTableReaderTestParamBuilder { + BlockBasedTableReaderTestParamBuilder() { + // Default values + compression_types = GetSupportedCompressions(); + use_direct_read_flags = Bool(); + index_types = { + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}; + no_block_cache_flags = {false}; + udt_test_modes = { + test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp}; + parallel_compression_thread_counts = {1, 2}; + compression_dict_byte_counts = {0, 4096}; + same_key_diff_ts_flags = {false}; + comparators = {BytewiseComparator()}; + fill_cache_flags = {true}; + use_async_io_flags = {false}; + block_align_flags = {false}; + super_block_alignment_sizes = {0}; + super_block_alignment_space_overhead_ratios = {128}; + } + + // builder methods for each member + BlockBasedTableReaderTestParamBuilder& WithCompressionTypes( + const std::vector& _compression_types) { + compression_types = _compression_types; + return *this; + } + + BlockBasedTableReaderTestParamBuilder& WithUseDirectReadFlags( + const std::vector& _use_direct_read_flags) { + use_direct_read_flags = _use_direct_read_flags; + return *this; + } + + BlockBasedTableReaderTestParamBuilder& WithIndexTypes( + const std::vector& _index_types) { + index_types = _index_types; + return *this; + } + + BlockBasedTableReaderTestParamBuilder& WithNoBlockCacheFlags( + const std::vector& _no_block_cache_flags) { + no_block_cache_flags = _no_block_cache_flags; + return *this; + } + + BlockBasedTableReaderTestParamBuilder& WithUDTTestModes( + const std::vector& _udt_test_modes) { + udt_test_modes = _udt_test_modes; + return *this; + } + + BlockBasedTableReaderTestParamBuilder& WithParallelCompressionThreadCounts( + const std::vector& _parallel_compression_thread_counts) { + parallel_compression_thread_counts = _parallel_compression_thread_counts; + return *this; + } + + BlockBasedTableReaderTestParamBuilder& WithCompressionDictByteCounts( + const std::vector& _compression_dict_byte_counts) { + compression_dict_byte_counts = _compression_dict_byte_counts; + return *this; + } + + BlockBasedTableReaderTestParamBuilder& WithSameKeyDiffTsFlags( + const std::vector& _same_key_diff_ts_flags) { + same_key_diff_ts_flags = _same_key_diff_ts_flags; + return *this; + } + + BlockBasedTableReaderTestParamBuilder& WithComparators( + const std::vector& _comparators) { + comparators = _comparators; + return *this; + } + + BlockBasedTableReaderTestParamBuilder& WithFillCacheFlags( + const std::vector& _fill_cache_flags) { + fill_cache_flags = _fill_cache_flags; + return *this; + } + + BlockBasedTableReaderTestParamBuilder& WithUseAsyncIoFlags( + const std::vector& _use_async_io_flags) { + use_async_io_flags = _use_async_io_flags; + return *this; + } + + BlockBasedTableReaderTestParamBuilder& WithBlockAlignFlags( + const std::vector& _block_align_flags) { + block_align_flags = _block_align_flags; + return *this; + } + + BlockBasedTableReaderTestParamBuilder& WithSuperBlockAlignmentSizes( + const std::vector& _super_block_alignment_sizes) { + super_block_alignment_sizes = _super_block_alignment_sizes; + return *this; + } + + BlockBasedTableReaderTestParamBuilder& + WithSuperBlockAlignmentSpaceOverheadRatios( + const std::vector& _super_block_alignment_space_overhead_ratios) { + super_block_alignment_space_overhead_ratios = + _super_block_alignment_space_overhead_ratios; + return *this; + } + + std::vector build() { + return GenerateCombinedParameters( + compression_types, use_direct_read_flags, index_types, + no_block_cache_flags, udt_test_modes, + parallel_compression_thread_counts, compression_dict_byte_counts, + same_key_diff_ts_flags, comparators, fill_cache_flags, + use_async_io_flags, block_align_flags, super_block_alignment_sizes, + super_block_alignment_space_overhead_ratios); + } + + std::vector compression_types; + std::vector use_direct_read_flags; + std::vector index_types; + std::vector no_block_cache_flags; + std::vector udt_test_modes; + std::vector parallel_compression_thread_counts; + std::vector compression_dict_byte_counts; + std::vector same_key_diff_ts_flags; + std::vector comparators; + std::vector fill_cache_flags; + std::vector use_async_io_flags; + std::vector block_align_flags; + std::vector super_block_alignment_sizes; + std::vector super_block_alignment_space_overhead_ratios; +}; + +std::vector IOUringFlags() { +#ifdef ROCKSDB_IOURING_PRESENT + return {false, true}; +#else + return {false}; +#endif +} + INSTANTIATE_TEST_CASE_P( BlockBasedTableReaderTest, BlockBasedTableReaderTest, - ::testing::Combine( - ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Bool(), - ::testing::Values( - BlockBasedTableOptions::IndexType::kBinarySearch, - BlockBasedTableOptions::IndexType::kHashSearch, - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, - BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey), - ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()), - ::testing::Values(1, 2), ::testing::Values(0, 4096), - ::testing::Values(false))); + ::testing::ValuesIn(BlockBasedTableReaderTestParamBuilder() + .WithUDTTestModes(test::GetUDTTestModes()) + .build())); + +INSTANTIATE_TEST_CASE_P( + BlockBasedTableReaderMultiScanAsyncIOTest, + BlockBasedTableReaderMultiScanAsyncIOTest, + ::testing::ValuesIn(BlockBasedTableReaderTestParamBuilder() + .WithComparators({BytewiseComparator(), + ReverseBytewiseComparator()}) + .WithFillCacheFlags(Bool()) + .WithUseAsyncIoFlags(IOUringFlags()) + .build())); + +INSTANTIATE_TEST_CASE_P( + BlockBasedTableReaderMultiScanTest, BlockBasedTableReaderMultiScanTest, + ::testing::ValuesIn(BlockBasedTableReaderTestParamBuilder() + .WithComparators({BytewiseComparator(), + ReverseBytewiseComparator()}) + .build())); + INSTANTIATE_TEST_CASE_P( BlockBasedTableReaderGetTest, BlockBasedTableReaderGetTest, - ::testing::Combine( - ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Bool(), - ::testing::Values( - BlockBasedTableOptions::IndexType::kBinarySearch, - BlockBasedTableOptions::IndexType::kHashSearch, - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, - BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey), - ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()), - ::testing::Values(1, 2), ::testing::Values(0, 4096), - ::testing::Values(false, true))); + ::testing::ValuesIn(BlockBasedTableReaderTestParamBuilder() + .WithUDTTestModes(test::GetUDTTestModes()) + .WithSameKeyDiffTsFlags(Bool()) + .WithComparators({BytewiseComparator(), + ReverseBytewiseComparator()}) + .WithFillCacheFlags({false}) + .build())); + +INSTANTIATE_TEST_CASE_P( + BlockBasedTableReaderSuperBlockAlignTest, BlockBasedTableReaderGetTest, + ::testing::ValuesIn( + BlockBasedTableReaderTestParamBuilder() + .WithIndexTypes( + {BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}) + .WithFillCacheFlags({false}) + .WithBlockAlignFlags(Bool()) + .WithSuperBlockAlignmentSizes({0, 32 * 1024, 16 * 1024}) + .WithSuperBlockAlignmentSpaceOverheadRatios({0, 4, 256}) + .build())); + INSTANTIATE_TEST_CASE_P( StrictCapacityLimitReaderTest, StrictCapacityLimitReaderTest, - ::testing::Combine( - ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Bool(), - ::testing::Values( - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch), - ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()), - ::testing::Values(1, 2), ::testing::Values(0), - ::testing::Values(false, true))); + ::testing::ValuesIn( + BlockBasedTableReaderTestParamBuilder() + .WithIndexTypes( + {BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}) + .WithUDTTestModes(test::GetUDTTestModes()) + .WithCompressionDictByteCounts({0}) + .WithSameKeyDiffTsFlags(Bool()) + .WithFillCacheFlags({false}) + .build())); + INSTANTIATE_TEST_CASE_P( VerifyChecksum, BlockBasedTableReaderTestVerifyChecksum, - ::testing::Combine( - ::testing::ValuesIn(GetSupportedCompressions()), - ::testing::Values(false), - ::testing::Values( - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch), - ::testing::Values(true), ::testing::ValuesIn(test::GetUDTTestModes()), - ::testing::Values(1, 2), ::testing::Values(0), - ::testing::Values(false))); - + ::testing::ValuesIn( + BlockBasedTableReaderTestParamBuilder() + .WithUseDirectReadFlags({false}) + .WithIndexTypes( + {BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}) + .WithNoBlockCacheFlags({true}) + .WithUDTTestModes(test::GetUDTTestModes()) + .WithCompressionDictByteCounts({0}) + .WithFillCacheFlags({false}) + .build())); } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/table/block_based/block_builder.cc b/table/block_based/block_builder.cc index e4950e4356bf..541ff6ea23da 100644 --- a/table/block_based/block_builder.cc +++ b/table/block_based/block_builder.cc @@ -21,15 +21,19 @@ // An entry for a particular key-value pair has the form: // shared_bytes: varint32 // unshared_bytes: varint32 -// value_length: varint32 +// value_length: varint32 (NOTE1) // key_delta: char[unshared_bytes] // value: char[value_length] -// shared_bytes == 0 for restart points. +// shared_bytes == 0 (explicitly stored) for restart points. // // The trailer of the block has the form: // restarts: uint32[num_restarts] // num_restarts: uint32 // restarts[i] contains the offset within the block of the ith restart point. +// +// NOTE1: omitted for format_version >= 4 index blocks, because the value is +// composed of one (shared_bytes > 0) or two (shared_bytes == 0) varints, whose +// length is self-describing. #include "table/block_based/block_builder.h" @@ -129,29 +133,28 @@ Slice BlockBuilder::Finish() { PutFixed32(&buffer_, restarts_[i]); } - uint32_t num_restarts = static_cast(restarts_.size()); - BlockBasedTableOptions::DataBlockIndexType index_type = - BlockBasedTableOptions::kDataBlockBinarySearch; + DataBlockFooter footer; + footer.num_restarts = static_cast(restarts_.size()); + footer.index_type = BlockBasedTableOptions::kDataBlockBinarySearch; if (data_block_hash_index_builder_.Valid() && CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) { data_block_hash_index_builder_.Finish(buffer_); - index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + footer.index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; } - // footer is a packed format of data_block_index_type and num_restarts - uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts); - - PutFixed32(&buffer_, block_footer); + footer.EncodeTo(&buffer_); finished_ = true; return Slice(buffer_); } void BlockBuilder::Add(const Slice& key, const Slice& value, - const Slice* const delta_value) { + const Slice* const delta_value, + bool skip_delta_encoding) { // Ensure no unsafe mixing of Add and AddWithLastKey assert(!add_with_last_key_called_); - AddWithLastKeyImpl(key, value, last_key_, delta_value, buffer_.size()); + AddWithLastKeyImpl(key, value, last_key_, delta_value, skip_delta_encoding, + buffer_.size()); if (use_delta_encoding_) { // Update state // We used to just copy the changed data, but it appears to be @@ -162,7 +165,8 @@ void BlockBuilder::Add(const Slice& key, const Slice& value, void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value, const Slice& last_key_param, - const Slice* const delta_value) { + const Slice* const delta_value, + bool skip_delta_encoding) { // Ensure no unsafe mixing of Add and AddWithLastKey assert(last_key_.empty()); #ifndef NDEBUG @@ -181,17 +185,18 @@ void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value, Slice last_key(last_key_param.data(), last_key_size * (buffer_size > 0)); - AddWithLastKeyImpl(key, value, last_key, delta_value, buffer_size); + AddWithLastKeyImpl(key, value, last_key, delta_value, skip_delta_encoding, + buffer_size); } inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key, const Slice& value, const Slice& last_key, const Slice* const delta_value, + bool skip_delta_encoding, size_t buffer_size) { assert(!finished_); assert(counter_ <= block_restart_interval_); - assert(!use_value_delta_encoding_ || delta_value); std::string key_buf; std::string last_key_buf; const Slice key_to_persist = MaybeStripTimestampFromKey(&key_buf, key); @@ -207,7 +212,7 @@ inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key, restarts_.push_back(static_cast(buffer_size)); estimate_ += sizeof(uint32_t); counter_ = 0; - } else if (use_delta_encoding_) { + } else if (use_delta_encoding_ && !skip_delta_encoding) { // See how much sharing to do with previous string shared = key_to_persist.difference_offset(last_key_persisted); } @@ -231,6 +236,7 @@ inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key, // simplify the decoding, where it can figure which decoding to use simply by // looking at the shared bytes size. if (shared != 0 && use_value_delta_encoding_) { + assert(delta_value != nullptr); buffer_.append(delta_value->data(), delta_value->size()); } else { buffer_.append(value.data(), value.size()); diff --git a/table/block_based/block_builder.h b/table/block_based/block_builder.h index f167470bb5f5..6cc9d836ab31 100644 --- a/table/block_based/block_builder.h +++ b/table/block_based/block_builder.h @@ -46,7 +46,8 @@ class BlockBuilder { // AddWithLastKey() in contexts where previous added key is already known // and delta encoding might be used. void Add(const Slice& key, const Slice& value, - const Slice* const delta_value = nullptr); + const Slice* const delta_value = nullptr, + bool skip_delta_encoding = false); // A faster version of Add() if the previous key is already known for all // Add()s. @@ -59,7 +60,8 @@ class BlockBuilder { // DO NOT mix with Add() between Resets. void AddWithLastKey(const Slice& key, const Slice& value, const Slice& last_key, - const Slice* const delta_value = nullptr); + const Slice* const delta_value = nullptr, + bool skip_delta_encoding = false); // Finish building the block and return a slice that refers to the // block contents. The returned slice will remain valid for the @@ -80,11 +82,13 @@ class BlockBuilder { // Return true iff no entries have been added since the last Reset() bool empty() const { return buffer_.empty(); } + std::string& MutableBuffer() { return buffer_; } + private: inline void AddWithLastKeyImpl(const Slice& key, const Slice& value, const Slice& last_key, const Slice* const delta_value, - size_t buffer_size); + bool skip_delta_encoding, size_t buffer_size); inline const Slice MaybeStripTimestampFromKey(std::string* key_buf, const Slice& key); diff --git a/table/block_based/block_cache.cc b/table/block_based/block_cache.cc index 08f5d2158dc5..28d181db5652 100644 --- a/table/block_based/block_cache.cc +++ b/table/block_based/block_cache.cc @@ -46,16 +46,22 @@ void BlockCreateContext::Create(std::unique_ptr* parsed_out, protection_bytes_per_key); } +void BlockCreateContext::Create( + std::unique_ptr* parsed_out, + BlockContents&& block) { + parsed_out->reset(new Block_kUserDefinedIndex(std::move(block))); +} + void BlockCreateContext::Create( std::unique_ptr* parsed_out, BlockContents&& block) { parsed_out->reset(new ParsedFullFilterBlock( table_options->filter_policy.get(), std::move(block))); } -void BlockCreateContext::Create(std::unique_ptr* parsed_out, +void BlockCreateContext::Create(std::unique_ptr* parsed_out, BlockContents&& block) { - parsed_out->reset(new UncompressionDict( - block.data, std::move(block.allocation), using_zstd)); + parsed_out->reset(new DecompressorDict( + block.data, std::move(block.allocation), *decompressor)); } namespace { @@ -69,7 +75,7 @@ const std::array::GetFullHelper(), BlockCacheInterface::GetFullHelper(), nullptr, // kProperties - BlockCacheInterface::GetFullHelper(), + BlockCacheInterface::GetFullHelper(), BlockCacheInterface::GetFullHelper(), nullptr, // kHashIndexPrefixes nullptr, // kHashIndexMetadata @@ -86,7 +92,7 @@ const std::array::GetBasicHelper(), BlockCacheInterface::GetBasicHelper(), nullptr, // kProperties - BlockCacheInterface::GetBasicHelper(), + BlockCacheInterface::GetBasicHelper(), BlockCacheInterface::GetBasicHelper(), nullptr, // kHashIndexPrefixes nullptr, // kHashIndexMetadata diff --git a/table/block_based/block_cache.h b/table/block_based/block_cache.h index d48a88f07137..564dcf0062db 100644 --- a/table/block_based/block_cache.h +++ b/table/block_based/block_cache.h @@ -67,19 +67,30 @@ class Block_kMetaIndex : public Block { static constexpr BlockType kBlockType = BlockType::kMetaIndex; }; +class Block_kUserDefinedIndex : public BlockContents { + public: + static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kIndexBlock; + static constexpr BlockType kBlockType = BlockType::kUserDefinedIndex; + + explicit Block_kUserDefinedIndex(BlockContents&& other) + : BlockContents(std::move(other)) {} + const Slice& ContentSlice() const { return data; } +}; + struct BlockCreateContext : public Cache::CreateContext { BlockCreateContext() {} BlockCreateContext(const BlockBasedTableOptions* _table_options, const ImmutableOptions* _ioptions, Statistics* _statistics, - bool _using_zstd, uint8_t _protection_bytes_per_key, + Decompressor* _decompressor, + uint8_t _protection_bytes_per_key, const Comparator* _raw_ucmp, bool _index_value_is_full = false, bool _index_has_first_key = false) : table_options(_table_options), ioptions(_ioptions), statistics(_statistics), + decompressor(_decompressor), raw_ucmp(_raw_ucmp), - using_zstd(_using_zstd), protection_bytes_per_key(_protection_bytes_per_key), index_value_is_full(_index_value_is_full), index_has_first_key(_index_has_first_key) {} @@ -87,10 +98,9 @@ struct BlockCreateContext : public Cache::CreateContext { const BlockBasedTableOptions* table_options = nullptr; const ImmutableOptions* ioptions = nullptr; Statistics* statistics = nullptr; + // TODO: refactor to avoid copying BlockCreateContext for dict in block cache + Decompressor* decompressor = nullptr; const Comparator* raw_ucmp = nullptr; - const UncompressionDict* dict = nullptr; - uint32_t format_version; - bool using_zstd = false; uint8_t protection_bytes_per_key = 0; bool index_value_is_full; bool index_has_first_key; @@ -102,12 +112,10 @@ struct BlockCreateContext : public Cache::CreateContext { CompressionType type, MemoryAllocator* alloc) { BlockContents uncompressed_block_contents; if (type != CompressionType::kNoCompression) { - assert(dict != nullptr); - UncompressionContext context(type); - UncompressionInfo info(context, *dict, type); - Status s = UncompressBlockData( - info, data.data(), data.size(), &uncompressed_block_contents, - table_options->format_version, *ioptions, alloc); + assert(decompressor != nullptr); + Status s = + DecompressBlockData(data.data(), data.size(), type, *decompressor, + &uncompressed_block_contents, *ioptions, alloc); if (!s.ok()) { parsed_out->reset(); return; @@ -128,9 +136,11 @@ struct BlockCreateContext : public Cache::CreateContext { BlockContents&& block); void Create(std::unique_ptr* parsed_out, BlockContents&& block); + void Create(std::unique_ptr* parsed_out, + BlockContents&& block); void Create(std::unique_ptr* parsed_out, BlockContents&& block); - void Create(std::unique_ptr* parsed_out, + void Create(std::unique_ptr* parsed_out, BlockContents&& block); }; diff --git a/table/block_based/block_prefetcher.cc b/table/block_based/block_prefetcher.cc index 52f0ef8fdfc2..bcebf5d36db0 100644 --- a/table/block_based/block_prefetcher.cc +++ b/table/block_based/block_prefetcher.cc @@ -39,16 +39,21 @@ void BlockPrefetcher::PrefetchIfNeeded( return; } IOOptions opts; - Status s = rep->file->PrepareIOOptions(read_options, opts); + IODebugContext dbg; + Status s = rep->file->PrepareIOOptions(read_options, opts, &dbg); if (!s.ok()) { return; } - s = rep->file->Prefetch(opts, offset, len + compaction_readahead_size_); - if (s.ok()) { - readahead_limit_ = offset + len + compaction_readahead_size_; - return; - } else if (!s.IsNotSupported()) { - return; + if (rep->fs_prefetch_support) { + s = rep->file->Prefetch(opts, offset, len + compaction_readahead_size_); + if (s.ok()) { + readahead_limit_ = offset + len + compaction_readahead_size_; + return; + } else if (!s.IsNotSupported()) { + return; + } + // If FS prefetch returned NotSupported despite feature bit being set, + // fall through to use internal prefetch buffer. } } // If FS prefetch is not supported, fall back to use internal prefetch @@ -58,9 +63,10 @@ void BlockPrefetcher::PrefetchIfNeeded( // implicit_auto_readahead is set. readahead_params.initial_readahead_size = compaction_readahead_size_; readahead_params.max_readahead_size = compaction_readahead_size_; - rep->CreateFilePrefetchBufferIfNotExists(readahead_params, - &prefetch_buffer_, - /*readaheadsize_cb=*/nullptr); + rep->CreateFilePrefetchBufferIfNotExists( + readahead_params, &prefetch_buffer_, + /*readaheadsize_cb=*/nullptr, + /*usage=*/FilePrefetchBufferUsage::kCompactionPrefetch); return; } @@ -140,19 +146,23 @@ void BlockPrefetcher::PrefetchIfNeeded( if (!s.ok()) { return; } - s = rep->file->Prefetch( - opts, handle.offset(), - BlockBasedTable::BlockSizeWithTrailer(handle) + readahead_size_); - if (s.IsNotSupported()) { - rep->CreateFilePrefetchBufferIfNotExists( - readahead_params, &prefetch_buffer_, readaheadsize_cb, - /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch); - return; - } - readahead_limit_ = offset + len + readahead_size_; - // Keep exponentially increasing readahead size until - // max_auto_readahead_size. - readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2); + if (rep->fs_prefetch_support) { + s = rep->file->Prefetch( + opts, handle.offset(), + BlockBasedTable::BlockSizeWithTrailer(handle) + readahead_size_); + if (s.ok()) { + readahead_limit_ = offset + len + readahead_size_; + // Keep exponentially increasing readahead size until + // max_auto_readahead_size. + readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2); + return; + } + } + // If FS prefetch is not supported or returned NotSupported, fall back to use + // internal prefetch buffer. + rep->CreateFilePrefetchBufferIfNotExists( + readahead_params, &prefetch_buffer_, readaheadsize_cb, + /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch); } } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc index b1a855263daa..49bec09084f6 100644 --- a/table/block_based/block_test.cc +++ b/table/block_based/block_test.cc @@ -33,10 +33,10 @@ namespace ROCKSDB_NAMESPACE { std::string GenerateInternalKey(int primary_key, int secondary_key, - int padding_size, Random *rnd, + int padding_size, Random* rnd, size_t ts_sz = 0) { char buf[50]; - char *p = &buf[0]; + char* p = &buf[0]; snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key); std::string k(p); if (padding_size) { @@ -55,8 +55,8 @@ std::string GenerateInternalKey(int primary_key, int secondary_key, // Generate random key value pairs. // The generated key will be sorted. You can tune the parameters to generated // different kinds of test key/value pairs for different scenario. -void GenerateRandomKVs(std::vector *keys, - std::vector *values, const int from, +void GenerateRandomKVs(std::vector* keys, + std::vector* values, const int from, const int len, const int step = 1, const int padding_size = 0, const int keys_share_prefix = 1, size_t ts_sz = 0) { @@ -133,7 +133,7 @@ TEST_P(BlockTest, SimpleTest) { // read contents of block sequentially int count = 0; - InternalIterator *iter = reader.NewDataIterator( + InternalIterator* iter = reader.NewDataIterator( options.comparator, kDisableGlobalSequenceNumber, nullptr /* iter */, nullptr /* stats */, false /* block_contents_pinned */, shouldPersistUDT()); @@ -169,9 +169,9 @@ TEST_P(BlockTest, SimpleTest) { // return the block contents BlockContents GetBlockContents( - std::unique_ptr *builder, - const std::vector &keys, - const std::vector &values, bool key_use_delta_encoding, + std::unique_ptr* builder, + const std::vector& keys, + const std::vector& values, bool key_use_delta_encoding, size_t ts_sz, bool should_persist_udt, const int /*prefix_group_size*/ = 1, BlockBasedTableOptions::DataBlockIndexType dblock_index_type = BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch) { @@ -194,8 +194,8 @@ BlockContents GetBlockContents( } void CheckBlockContents(BlockContents contents, const int max_key, - const std::vector &keys, - const std::vector &values, + const std::vector& keys, + const std::vector& values, bool is_udt_enabled, bool should_persist_udt) { const size_t prefix_size = 6; // create block reader @@ -356,8 +356,8 @@ class BlockReadAmpBitmapSlowAndAccurate { TEST_F(BlockTest, BlockReadAmpBitmap) { uint32_t pin_offset = 0; SyncPoint::GetInstance()->SetCallBack( - "BlockReadAmpBitmap:rnd", [&pin_offset](void *arg) { - pin_offset = *(static_cast(arg)); + "BlockReadAmpBitmap:rnd", [&pin_offset](void* arg) { + pin_offset = *(static_cast(arg)); }); SyncPoint::GetInstance()->EnableProcessing(); std::vector block_sizes = { @@ -414,7 +414,7 @@ TEST_F(BlockTest, BlockReadAmpBitmap) { for (size_t i = 0; i < random_entries.size(); i++) { read_amp_slow_and_accurate.ResetCheckSequence(); - auto ¤t_entry = random_entries[rnd.Next() % random_entries.size()]; + auto& current_entry = random_entries[rnd.Next() % random_entries.size()]; read_amp_bitmap.Mark(static_cast(current_entry.first), static_cast(current_entry.second)); @@ -465,7 +465,7 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { // read contents of block sequentially size_t read_bytes = 0; - DataBlockIter *iter = reader.NewDataIterator( + DataBlockIter* iter = reader.NewDataIterator( options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { iter->value(); @@ -496,7 +496,7 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { Block reader(std::move(contents), kBytesPerBit, stats.get()); size_t read_bytes = 0; - DataBlockIter *iter = reader.NewDataIterator( + DataBlockIter* iter = reader.NewDataIterator( options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get()); for (int i = 0; i < num_records; i++) { Slice k(keys[i]); @@ -530,7 +530,7 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { Block reader(std::move(contents), kBytesPerBit, stats.get()); size_t read_bytes = 0; - DataBlockIter *iter = reader.NewDataIterator( + DataBlockIter* iter = reader.NewDataIterator( options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get()); std::unordered_set read_keys; for (int i = 0; i < num_records; i++) { @@ -576,10 +576,29 @@ TEST_F(BlockTest, ReadAmpBitmapPow2) { ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32u); } +void AddIndexBlockEntry(BlockBuilder& builder, const Slice& key, + const BlockHandle& bh, const BlockHandle* prev, + bool include_first_key, + const Slice& first_internal_key = Slice()) { + IndexValue entry(bh, first_internal_key); + std::string encoded_entry; + entry.EncodeTo(&encoded_entry, include_first_key, nullptr); + std::string delta_encoded_entry; + if (prev) { + entry.EncodeTo(&delta_encoded_entry, include_first_key, prev); + } + const Slice delta_slice(delta_encoded_entry); + builder.Add(key, encoded_entry, &delta_slice); +} + +enum class KeyDistribution { kUniform, kNonUniform }; + class IndexBlockTest : public testing::Test, public testing::WithParamInterface< - std::tuple> { + std::tuple> { public: IndexBlockTest() = default; @@ -592,25 +611,52 @@ class IndexBlockTest bool shouldPersistUDT() const { return test::ShouldPersistUDT(std::get<3>(GetParam())); } + BlockBasedTableOptions::BlockSearchType indexSearchType() const { + return isUDTEnabled() ? BlockBasedTableOptions::kBinary + : std::get<4>(GetParam()); + } + int numRecords() const { + return std::min(1 << keyLength(), std::get<5>(GetParam())); + } + int indexBlockRestartInterval() const { return std::get<6>(GetParam()); } + int keyLength() const { return std::get<7>(GetParam()); } + int prefixLength() const { return std::get<8>(GetParam()); } + KeyDistribution keyDistribution() const { return std::get<9>(GetParam()); } }; -// Similar to GenerateRandomKVs but for index block contents. -void GenerateRandomIndexEntries(std::vector *separators, - std::vector *block_handles, - std::vector *first_keys, - const int len, size_t ts_sz = 0, - bool zero_seqno = false) { +// Similar to GenerateRandomKVs but for index block contents. Keys always +// contain a 0-sequence number, callers may extract the user key if needed. +void GenerateRandomIndexEntries( + std::vector* separators, + std::vector* block_handles, + std::vector* first_keys, const int len, size_t ts_sz = 0, + int key_length = 12, int prefix_length = 0, + KeyDistribution distribution = KeyDistribution::kUniform) { Random rnd(42); + std::string prefix(prefix_length, 'x'); // For each of `len` blocks, we need to generate a first and last key. - // Let's generate n*2 random keys, sort them, group into consecutive pairs. + // Generate n*2 random keys, sort them, group into consecutive pairs. std::set keys; + + // Two clusters with shared prefixes of effective_key_length - 2. This + // stresses interpolation search's uniform distribution assumption. + int cluster_prefix_len = std::max(0, key_length - 5); + std::string cluster1_prefix = prefix + rnd.RandomString(cluster_prefix_len); + std::string cluster2_prefix = prefix + rnd.RandomString(cluster_prefix_len); + while ((int)keys.size() < len * 2) { - // Keys need to be at least 8 bytes long to look like internal keys. - std::string new_key = test::RandomKey(&rnd, 12); - if (zero_seqno) { - AppendInternalKeyFooter(&new_key, 0 /* seqno */, kTypeValue); + std::string new_key; + if (distribution == KeyDistribution::kNonUniform) { + int remaining = key_length - cluster_prefix_len; + const std::string& cp = + (keys.size() % 2 == 0) ? cluster1_prefix : cluster2_prefix; + new_key = cp + rnd.RandomString(std::max(1, remaining)); + } else { + new_key = prefix + test::RandomKey(&rnd, key_length); } + + AppendInternalKeyFooter(&new_key, 0 /* seqno */, kTypeValue); if (ts_sz > 0) { std::string key; PadInternalKeyWithMinTimestamp(&key, new_key, ts_sz); @@ -643,15 +689,17 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) { std::vector block_handles; std::vector first_keys; const bool kUseDeltaEncoding = true; - BlockBuilder builder(16, kUseDeltaEncoding, useValueDeltaEncoding(), + BlockBuilder builder(indexBlockRestartInterval(), kUseDeltaEncoding, + useValueDeltaEncoding(), BlockBasedTableOptions::kDataBlockBinarySearch, 0.75 /* data_block_hash_table_util_ratio */, ts_sz, shouldPersistUDT(), !keyIncludesSeq()); - int num_records = 100; + int num_records = numRecords(); GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, - num_records, ts_sz, false /* zero_seqno */); + num_records, ts_sz, keyLength(), prefixLength(), + keyDistribution()); BlockHandle last_encoded_handle; for (int i = 0; i < num_records; i++) { std::string first_key_to_persist_buf; @@ -661,23 +709,13 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) { ts_sz); first_internal_key = first_key_to_persist_buf; } - IndexValue entry(block_handles[i], first_internal_key); - std::string encoded_entry; - std::string delta_encoded_entry; - entry.EncodeTo(&encoded_entry, includeFirstKey(), nullptr); - if (useValueDeltaEncoding() && i > 0) { - entry.EncodeTo(&delta_encoded_entry, includeFirstKey(), - &last_encoded_handle); - } - last_encoded_handle = entry.handle; - const Slice delta_encoded_entry_slice(delta_encoded_entry); - - if (keyIncludesSeq()) { - builder.Add(separators[i], encoded_entry, &delta_encoded_entry_slice); - } else { - const Slice user_key = ExtractUserKey(separators[i]); - builder.Add(user_key, encoded_entry, &delta_encoded_entry_slice); - } + const BlockHandle* prev = + (useValueDeltaEncoding() && i > 0) ? &last_encoded_handle : nullptr; + Slice add_key = + keyIncludesSeq() ? Slice(separators[i]) : ExtractUserKey(separators[i]); + AddIndexBlockEntry(builder, add_key, block_handles[i], prev, + includeFirstKey(), first_internal_key); + last_encoded_handle = block_handles[i]; } // read serialized contents of the block @@ -689,14 +727,14 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) { Block reader(std::move(contents)); const bool kTotalOrderSeek = true; - IndexBlockIter *kNullIter = nullptr; - Statistics *kNullStats = nullptr; + IndexBlockIter* kNullIter = nullptr; + Statistics* kNullStats = nullptr; // read contents of block sequentially - InternalIteratorBase *iter = reader.NewIndexIterator( + InternalIteratorBase* iter = reader.NewIndexIterator( options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats, kTotalOrderSeek, includeFirstKey(), keyIncludesSeq(), !useValueDeltaEncoding(), false /* block_contents_pinned */, - shouldPersistUDT()); + shouldPersistUDT(), nullptr /* prefix_index */, indexSearchType()); iter->SeekToFirst(); for (int index = 0; index < num_records; ++index) { ASSERT_TRUE(iter->Valid()); @@ -724,7 +762,7 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) { options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats, kTotalOrderSeek, includeFirstKey(), keyIncludesSeq(), !useValueDeltaEncoding(), false /* block_contents_pinned */, - shouldPersistUDT()); + shouldPersistUDT(), nullptr /* prefix_index */, indexSearchType()); for (int i = 0; i < num_records * 2; i++) { // find a random key in the lookaside array int index = rnd.Uniform(num_records); @@ -753,10 +791,205 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) { // Param 1: use value delta encoding // Param 2: include first key // Param 3: user-defined timestamp test mode +// Param 4: index search type (binary search or interpolation search) +// Param 5: number of records +// Param 6: index block restart interval +// Param 7: key length +// Param 8: prefix length +// Param 9: key distribution (uniform or non-uniform) INSTANTIATE_TEST_CASE_P( P, IndexBlockTest, - ::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(), - ::testing::ValuesIn(test::GetUDTTestModes()))); + ::testing::Combine( + ::testing::Bool(), ::testing::Bool(), ::testing::Bool(), + ::testing::ValuesIn(test::GetUDTTestModes()), + ::testing::Values( + BlockBasedTableOptions::BlockSearchType::kBinary, + BlockBasedTableOptions::BlockSearchType::kInterpolation), + ::testing::Values(1, 100), // num_records + ::testing::Values(1, 16), // index_block_restart_interval + ::testing::Values(1, 8, 12), // key_length + ::testing::Values(0, 50), // prefix_length + ::testing::Values(KeyDistribution::kUniform, + KeyDistribution::kNonUniform))); + +TEST(IndexBlockTest, InterpolationSearchPrefixBoundary) { + const bool kIncludeFirstKey = false; + const bool kUseValueDeltaEncoding = true; + const uint64_t kBlockSize = 50; + + // 20 user keys sharing prefix "ABCDEFGHIJ" with evenly spaced suffixes. + const std::string kPrefix = "ABCDEFGHIJ"; + const int kNumKeys = 20; + std::vector keys; + keys.reserve(kNumKeys); + for (int i = 0; i < kNumKeys; i++) { + std::string suffix = std::to_string(i); + char formatted_suffix[4]; + snprintf(formatted_suffix, sizeof(formatted_suffix), "%03d", i); + keys.push_back(kPrefix + formatted_suffix); + } + + std::vector handles; + handles.reserve(kNumKeys); + for (int i = 0; i < kNumKeys; i++) { + handles.emplace_back(i * (kBlockSize + BlockBasedTable::kBlockTrailerSize), + kBlockSize); + } + + BlockBuilder builder( + 1 /* restart_interval */, true /* use_delta_encoding */, + kUseValueDeltaEncoding, BlockBasedTableOptions::kDataBlockBinarySearch, + 0.75 /* data_block_hash_table_util_ratio */, 0 /* ts_sz */, + false /* persist_udt */, true /* is_user_key */); + + for (int i = 0; i < kNumKeys; i++) { + BlockHandle* prev = i > 0 ? &handles[i - 1] : nullptr; + AddIndexBlockEntry(builder, keys[i], handles[i], prev, kIncludeFirstKey); + } + + Slice rawblock = builder.Finish(); + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents)); + + // Seek targets must be internal keys since SeekImpl calls ExtractUserKey(). + auto make_target = [](const std::string& user_key) { + std::string target = user_key; + AppendInternalKeyFooter(&target, kMaxSequenceNumber, kValueTypeForSeek); + return target; + }; + + std::unique_ptr> iter( + reader.NewIndexIterator( + BytewiseComparator(), kDisableGlobalSequenceNumber, + nullptr /* iter */, nullptr /* stats */, true /* total_order_seek */, + kIncludeFirstKey, false /* key_includes_seq */, + !kUseValueDeltaEncoding /* value_is_full */, + false /* block_contents_pinned */, + true /* user_defined_timestamps_persisted */, + nullptr /* prefix_index */, + BlockBasedTableOptions::BlockSearchType::kInterpolation)); + + // Case 1: target prefix < shared prefix + iter->Seek(make_target("AAAAAA")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(iter->key(), keys[0]); + + iter->Seek(make_target("")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(iter->key(), keys[0]); + + // Case 2: target prefix > shared prefix + iter->Seek(make_target("ABCDEFGHZZ")); + ASSERT_FALSE(iter->Valid()); + + // Case 3: target is the prefix + iter->Seek(make_target("ABCDEFGHIJ")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(iter->key(), keys[0]); + + // Case 4: target a subset of the prefix + iter->Seek(make_target("ABCDEFG")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(iter->key(), keys[0]); +} + +// Like the above test, but extend the shared prefix into internal bytes +TEST(IndexBlockTest, InterpolationSearchPrefixBoundary2) { + const bool kIncludeFirstKey = false; + const bool kUseValueDeltaEncoding = true; + const uint64_t kBlockSize = 50; + + // 20 internal keys with the same user key but decreasing sequence numbers + // (which is ascending InternalKeyComparator order). + const std::string kUserKey = "ABCDEFGHIJ"; + const int kNumKeys = 20; + std::vector keys; + keys.reserve(kNumKeys); + for (int i = 0; i < kNumKeys; i++) { + std::string ikey = kUserKey; + SequenceNumber seq = static_cast(kNumKeys - i); + AppendInternalKeyFooter(&ikey, seq, kTypeValue); + keys.push_back(ikey); + } + + std::vector handles; + handles.reserve(kNumKeys); + for (int i = 0; i < kNumKeys; i++) { + handles.emplace_back(i * (kBlockSize + BlockBasedTable::kBlockTrailerSize), + kBlockSize); + } + + BlockBuilder builder( + 1 /* restart_interval */, true /* use_delta_encoding */, + kUseValueDeltaEncoding, BlockBasedTableOptions::kDataBlockBinarySearch, + 0.75 /* data_block_hash_table_util_ratio */, 0 /* ts_sz */, + false /* persist_udt */, false /* is_user_key */); + + for (int i = 0; i < kNumKeys; i++) { + BlockHandle* prev = i > 0 ? &handles[i - 1] : nullptr; + AddIndexBlockEntry(builder, keys[i], handles[i], prev, kIncludeFirstKey); + } + + Slice rawblock = builder.Finish(); + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents)); + + auto make_target = [&](const std::string& user_key, + SequenceNumber seq = kMaxSequenceNumber) { + std::string target = user_key; + AppendInternalKeyFooter(&target, seq, kTypeValue); + return target; + }; + + std::unique_ptr> iter( + reader.NewIndexIterator( + BytewiseComparator(), kDisableGlobalSequenceNumber, + nullptr /* iter */, nullptr /* stats */, true /* total_order_seek */, + kIncludeFirstKey, true /* key_includes_seq */, + !kUseValueDeltaEncoding /* value_is_full */, + false /* block_contents_pinned */, + true /* user_defined_timestamps_persisted */, + nullptr /* prefix_index */, + BlockBasedTableOptions::BlockSearchType::kInterpolation)); + + // Seek to each existing sequence number + for (int i = 0; i < kNumKeys; i++) { + SequenceNumber seq = static_cast(kNumKeys - i); + iter->Seek(make_target(kUserKey, seq)); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(iter->key(), keys[i]); + } + + // Case 1: target prefix < shared prefix + iter->Seek(make_target("AAAAAA")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(iter->key(), keys[0]); + + iter->Seek(make_target("")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(iter->key(), keys[0]); + + // Case 2: target prefix > shared prefix + iter->Seek(make_target("ABCDEFGHZZ")); + ASSERT_FALSE(iter->Valid()); + + // Case 3: target has the same user key with kMaxSequenceNumber + iter->Seek(make_target("ABCDEFGHIJ")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(iter->key(), keys[0]); + + // Case 4: target a subset of the prefix + iter->Seek(make_target("ABCDEFG")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(iter->key(), keys[0]); + + // Case 5: target key is a prefix that also extends into the internal bytes + // footer + iter->Seek(make_target("ABCDEFGHIJ" + std::string(1, kTypeValue))); + ASSERT_FALSE(iter->Valid()); +} class BlockPerKVChecksumTest : public DBTestBase { public: @@ -764,8 +997,8 @@ class BlockPerKVChecksumTest : public DBTestBase { : DBTestBase("block_per_kv_checksum", /*env_do_fsync=*/false) {} template - void TestIterateForward(std::unique_ptr &biter, - size_t &verification_count) { + void TestIterateForward(std::unique_ptr& biter, + size_t& verification_count) { while (biter->Valid()) { verification_count = 0; biter->Next(); @@ -776,8 +1009,8 @@ class BlockPerKVChecksumTest : public DBTestBase { } template - void TestIterateBackward(std::unique_ptr &biter, - size_t &verification_count) { + void TestIterateBackward(std::unique_ptr& biter, + size_t& verification_count) { while (biter->Valid()) { verification_count = 0; biter->Prev(); @@ -788,8 +1021,8 @@ class BlockPerKVChecksumTest : public DBTestBase { } template - void TestSeekToFirst(std::unique_ptr &biter, - size_t &verification_count) { + void TestSeekToFirst(std::unique_ptr& biter, + size_t& verification_count) { verification_count = 0; biter->SeekToFirst(); ASSERT_GE(verification_count, 1); @@ -797,8 +1030,8 @@ class BlockPerKVChecksumTest : public DBTestBase { } template - void TestSeekToLast(std::unique_ptr &biter, - size_t &verification_count) { + void TestSeekToLast(std::unique_ptr& biter, + size_t& verification_count) { verification_count = 0; biter->SeekToLast(); ASSERT_GE(verification_count, 1); @@ -806,8 +1039,8 @@ class BlockPerKVChecksumTest : public DBTestBase { } template - void TestSeekForPrev(std::unique_ptr &biter, - size_t &verification_count, std::string k) { + void TestSeekForPrev(std::unique_ptr& biter, + size_t& verification_count, const std::string& k) { verification_count = 0; biter->SeekForPrev(k); ASSERT_GE(verification_count, 1); @@ -815,16 +1048,16 @@ class BlockPerKVChecksumTest : public DBTestBase { } template - void TestSeek(std::unique_ptr &biter, size_t &verification_count, - std::string k) { + void TestSeek(std::unique_ptr& biter, size_t& verification_count, + const std::string& k) { verification_count = 0; biter->Seek(k); ASSERT_GE(verification_count, 1); TestIterateForward(biter, verification_count); } - bool VerifyChecksum(uint32_t checksum_len, const char *checksum_ptr, - const Slice &key, const Slice &val) { + bool VerifyChecksum(uint32_t checksum_len, const char* checksum_ptr, + const Slice& key, const Slice& val) { if (!checksum_len) { return checksum_ptr == nullptr; } @@ -833,6 +1066,18 @@ class BlockPerKVChecksumTest : public DBTestBase { } }; +namespace { +const BlockBasedTableOptions* kTableOptions() { + static BlockBasedTableOptions opts{}; + return &opts; +} +Decompressor* kDecompressor() { + static auto mgr = GetBuiltinV2CompressionManager(); + static auto decomp = mgr->GetDecompressor(); + return decomp.get(); +} +} // namespace + TEST_F(BlockPerKVChecksumTest, EmptyBlock) { // Tests that empty block code path is not broken by per kv checksum. BlockBuilder builder( @@ -845,14 +1090,11 @@ TEST_F(BlockPerKVChecksumTest, EmptyBlock) { std::unique_ptr data_block; Options options = Options(); - BlockBasedTableOptions tbo; uint8_t protection_bytes_per_key = 8; - BlockCreateContext create_context{&tbo, - nullptr, - nullptr /* statistics */, - false /* using_zstd */, - protection_bytes_per_key, - options.comparator}; + BlockCreateContext create_context{ + kTableOptions(), nullptr, + nullptr /* statistics */, kDecompressor(), + protection_bytes_per_key, options.comparator}; create_context.Create(&data_block, std::move(contents)); std::unique_ptr biter{data_block->NewDataIterator( options.comparator, kDisableGlobalSequenceNumber)}; @@ -885,14 +1127,10 @@ TEST_F(BlockPerKVChecksumTest, InitializeProtectionInfo) { // Make sure that the checksum construction code path does not break // when the block is itself already corrupted. Options options = Options(); - BlockBasedTableOptions tbo; uint8_t protection_bytes_per_key = 8; - BlockCreateContext create_context{&tbo, - nullptr /* ioptions */, - nullptr /* statistics */, - false /* using_zstd */, - protection_bytes_per_key, - options.comparator}; + BlockCreateContext create_context{ + kTableOptions(), nullptr /* ioptions */, nullptr /* statistics */, + kDecompressor(), protection_bytes_per_key, options.comparator}; { std::string invalid_content = "1"; @@ -950,20 +1188,19 @@ TEST_F(BlockPerKVChecksumTest, ApproximateMemory) { }; Options options = Options(); - BlockBasedTableOptions tbo; uint8_t protection_bytes_per_key = 8; BlockCreateContext with_checksum_create_context{ - &tbo, + kTableOptions(), nullptr /* ioptions */, nullptr /* statistics */, - false /* using_zstd */, + kDecompressor(), protection_bytes_per_key, options.comparator, true /* index_value_is_full */}; - BlockCreateContext create_context{&tbo, + BlockCreateContext create_context{kTableOptions(), nullptr /* ioptions */, nullptr /* statistics */, - false /* using_zstd */, + kDecompressor(), 0, options.comparator, true /* index_value_is_full */}; @@ -1052,15 +1289,11 @@ class DataBlockKVChecksumTest bool GetUseDeltaEncoding() const { return std::get<3>(GetParam()); } std::unique_ptr GenerateDataBlock( - std::vector &keys, std::vector &values, + std::vector& keys, std::vector& values, int num_record) { - BlockBasedTableOptions tbo; - BlockCreateContext create_context{&tbo, - nullptr /* statistics */, - nullptr /* ioptions */, - false /* using_zstd */, - GetChecksumLen(), - Options().comparator}; + BlockCreateContext create_context{ + kTableOptions(), nullptr /* statistics */, nullptr /* ioptions */, + kDecompressor(), GetChecksumLen(), Options().comparator}; builder_ = std::make_unique( static_cast(GetRestartInterval()), GetUseDeltaEncoding() /* use_delta_encoding */, @@ -1089,9 +1322,9 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(0, 1, 2, 4, 8) /* protection_bytes_per_key */, ::testing::Values(1, 2, 3, 8, 16) /* restart_interval */, ::testing::Values(false, true)) /* delta_encoding */, - [](const testing::TestParamInfo> - &args) { + [](const testing::TestParamInfo< + std::tuple>& args) { std::ostringstream oss; oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtectionPerKey" << std::to_string(std::get<1>(args.param)) @@ -1114,7 +1347,7 @@ TEST_P(DataBlockKVChecksumTest, ChecksumConstructionAndVerification) { std::unique_ptr data_block = GenerateDataBlock(keys, values, kNumRecords); - const char *checksum_ptr = data_block->TEST_GetKVChecksum(); + const char* checksum_ptr = data_block->TEST_GetKVChecksum(); // Check checksum of correct length is generated for (int i = 0; i < kNumRecords; i++) { ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key, @@ -1132,8 +1365,8 @@ TEST_P(DataBlockKVChecksumTest, ChecksumConstructionAndVerification) { // that case (see Block::VerifyChecksum()). SyncPoint::GetInstance()->SetCallBack( "Block::VerifyChecksum::checksum_len", - [&verification_count, protection_bytes_per_key](void *checksum_len) { - ASSERT_EQ((*static_cast(checksum_len)), + [&verification_count, protection_bytes_per_key](void* checksum_len) { + ASSERT_EQ((*static_cast(checksum_len)), protection_bytes_per_key); ++verification_count; }); @@ -1177,17 +1410,16 @@ class IndexBlockKVChecksumTest bool IncludeFirstKey() const { return std::get<4>(GetParam()); } std::unique_ptr GenerateIndexBlock( - std::vector &separators, - std::vector &block_handles, - std::vector &first_keys, int num_record) { + std::vector& separators, + std::vector& block_handles, + std::vector& first_keys, int num_record) { Options options = Options(); - BlockBasedTableOptions tbo; uint8_t protection_bytes_per_key = GetChecksumLen(); BlockCreateContext create_context{ - &tbo, + kTableOptions(), nullptr /* ioptions */, nullptr /* statistics */, - false /* _using_zstd */, + kDecompressor(), protection_bytes_per_key, options.comparator, !UseValueDeltaEncoding() /* value_is_full */, @@ -1236,7 +1468,7 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(true, false), ::testing::Values(true, false)), [](const testing::TestParamInfo< std::tuple> &args) { + uint32_t, bool, bool>>& args) { std::ostringstream oss; oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes" << std::to_string(std::get<1>(args.param)) << "RestartInterval" @@ -1260,13 +1492,12 @@ TEST_P(IndexBlockKVChecksumTest, ChecksumConstructionAndVerification) { std::vector block_handles; std::vector first_keys; GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, - kNumRecords, 0 /* ts_sz */, - seqno != kDisableGlobalSequenceNumber); + kNumRecords, 0 /* ts_sz */); SyncPoint::GetInstance()->DisableProcessing(); std::unique_ptr index_block = GenerateIndexBlock( separators, block_handles, first_keys, kNumRecords); - IndexBlockIter *kNullIter = nullptr; - Statistics *kNullStats = nullptr; + IndexBlockIter* kNullIter = nullptr; + Statistics* kNullStats = nullptr; // read contents of block sequentially std::unique_ptr biter{index_block->NewIndexIterator( options.comparator, seqno, kNullIter, kNullStats, @@ -1277,7 +1508,7 @@ TEST_P(IndexBlockKVChecksumTest, ChecksumConstructionAndVerification) { true /* user_defined_timestamps_persisted */, nullptr /* prefix_index */)}; biter->SeekToFirst(); - const char *checksum_ptr = index_block->TEST_GetKVChecksum(); + const char* checksum_ptr = index_block->TEST_GetKVChecksum(); // Check checksum of correct length is generated for (int i = 0; i < kNumRecords; i++) { // Obtaining the actual content written as value to index block is not @@ -1297,8 +1528,8 @@ TEST_P(IndexBlockKVChecksumTest, ChecksumConstructionAndVerification) { // assert checking on checksum_len here. SyncPoint::GetInstance()->SetCallBack( "Block::VerifyChecksum::checksum_len", - [&verification_count, protection_bytes_per_key](void *checksum_len) { - ASSERT_EQ((*static_cast(checksum_len)), + [&verification_count, protection_bytes_per_key](void* checksum_len) { + ASSERT_EQ((*static_cast(checksum_len)), protection_bytes_per_key); ++verification_count; }); @@ -1321,17 +1552,13 @@ class MetaIndexBlockKVChecksumTest uint32_t GetRestartInterval() const { return 1; } std::unique_ptr GenerateMetaIndexBlock( - std::vector &keys, std::vector &values, + std::vector& keys, std::vector& values, int num_record) { Options options = Options(); - BlockBasedTableOptions tbo; uint8_t protection_bytes_per_key = GetChecksumLen(); - BlockCreateContext create_context{&tbo, - nullptr /* ioptions */, - nullptr /* statistics */, - false /* using_zstd */, - protection_bytes_per_key, - options.comparator}; + BlockCreateContext create_context{ + kTableOptions(), nullptr /* ioptions */, nullptr /* statistics */, + kDecompressor(), protection_bytes_per_key, options.comparator}; builder_ = std::make_unique(static_cast(GetRestartInterval())); // add a bunch of records to a block @@ -1351,7 +1578,7 @@ class MetaIndexBlockKVChecksumTest INSTANTIATE_TEST_CASE_P(P, MetaIndexBlockKVChecksumTest, ::testing::Values(0, 1, 2, 4, 8), - [](const testing::TestParamInfo &args) { + [](const testing::TestParamInfo& args) { std::ostringstream oss; oss << "ProtBytes" << std::to_string(args.param); return oss.str(); @@ -1359,14 +1586,10 @@ INSTANTIATE_TEST_CASE_P(P, MetaIndexBlockKVChecksumTest, TEST_P(MetaIndexBlockKVChecksumTest, ChecksumConstructionAndVerification) { Options options = Options(); - BlockBasedTableOptions tbo; uint8_t protection_bytes_per_key = GetChecksumLen(); - BlockCreateContext create_context{&tbo, - nullptr /* ioptions */, - nullptr /* statistics */, - false /* using_zstd */, - protection_bytes_per_key, - options.comparator}; + BlockCreateContext create_context{ + kTableOptions(), nullptr /* ioptions */, nullptr /* statistics */, + kDecompressor(), protection_bytes_per_key, options.comparator}; std::vector num_restart_intervals = {1, 16}; for (const auto num_restart_interval : num_restart_intervals) { const int kNumRecords = num_restart_interval * GetRestartInterval(); @@ -1377,7 +1600,7 @@ TEST_P(MetaIndexBlockKVChecksumTest, ChecksumConstructionAndVerification) { SyncPoint::GetInstance()->DisableProcessing(); std::unique_ptr meta_block = GenerateMetaIndexBlock(keys, values, kNumRecords); - const char *checksum_ptr = meta_block->TEST_GetKVChecksum(); + const char* checksum_ptr = meta_block->TEST_GetKVChecksum(); // Check checksum of correct length is generated for (int i = 0; i < kNumRecords; i++) { ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key, @@ -1392,8 +1615,8 @@ TEST_P(MetaIndexBlockKVChecksumTest, ChecksumConstructionAndVerification) { // checking on checksum_len here. SyncPoint::GetInstance()->SetCallBack( "Block::VerifyChecksum::checksum_len", - [&verification_count, protection_bytes_per_key](void *checksum_len) { - ASSERT_EQ((*static_cast(checksum_len)), + [&verification_count, protection_bytes_per_key](void* checksum_len) { + ASSERT_EQ((*static_cast(checksum_len)), protection_bytes_per_key); ++verification_count; }); @@ -1413,7 +1636,7 @@ class DataBlockKVChecksumCorruptionTest : public DataBlockKVChecksumTest { DataBlockKVChecksumCorruptionTest() = default; std::unique_ptr GenerateDataBlockIter( - std::vector &keys, std::vector &values, + std::vector& keys, std::vector& values, int num_record) { // During Block construction, we may create block iter to initialize per kv // checksum. Disable syncpoint that may be created for block iter methods. @@ -1439,15 +1662,15 @@ TEST_P(DataBlockKVChecksumCorruptionTest, CorruptEntry) { GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */, 24 /* padding_size */); SyncPoint::GetInstance()->SetCallBack( - "BlockIter::UpdateKey::value", [](void *arg) { - char *value = static_cast(arg); + "BlockIter::UpdateKey::value", [](void* arg) { + char* value = static_cast(arg); // values generated by GenerateRandomKVs are of length 100 ++value[10]; }); // Purely for reducing the number of lines of code. typedef std::unique_ptr IterPtr; - typedef void(IterAPI)(IterPtr & iter, std::string &); + typedef void(IterAPI)(IterPtr & iter, std::string&); std::string seek_key = keys[kNumRecords / 2]; auto test_seek = [&](IterAPI iter_api) { @@ -1458,14 +1681,14 @@ TEST_P(DataBlockKVChecksumCorruptionTest, CorruptEntry) { ASSERT_TRUE(biter->status().IsCorruption()); }; - test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); }); - test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); }); - test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); }); - test_seek([](IterPtr &iter, std::string &k) { iter->SeekForPrev(k); }); - test_seek([](IterPtr &iter, std::string &k) { iter->SeekForGet(k); }); + test_seek([](IterPtr& iter, std::string&) { iter->SeekToFirst(); }); + test_seek([](IterPtr& iter, std::string&) { iter->SeekToLast(); }); + test_seek([](IterPtr& iter, std::string& k) { iter->Seek(k); }); + test_seek([](IterPtr& iter, std::string& k) { iter->SeekForPrev(k); }); + test_seek([](IterPtr& iter, std::string& k) { iter->SeekForGet(k); }); typedef void (DataBlockIter::*IterStepAPI)(); - auto test_step = [&](IterStepAPI iter_api, std::string &k) { + auto test_step = [&](IterStepAPI iter_api, std::string& k) { IterPtr biter = GenerateDataBlockIter(keys, values, kNumRecords); SyncPoint::GetInstance()->DisableProcessing(); biter->Seek(k); @@ -1494,9 +1717,9 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(4, 8) /* block_protection_bytes_per_key */, ::testing::Values(1, 3, 8, 16) /* restart_interval */, ::testing::Values(false, true)), - [](const testing::TestParamInfo> - &args) { + [](const testing::TestParamInfo< + std::tuple>& args) { std::ostringstream oss; oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes" << std::to_string(std::get<1>(args.param)) << "RestartInterval" @@ -1510,9 +1733,9 @@ class IndexBlockKVChecksumCorruptionTest : public IndexBlockKVChecksumTest { IndexBlockKVChecksumCorruptionTest() = default; std::unique_ptr GenerateIndexBlockIter( - std::vector &separators, - std::vector &block_handles, - std::vector &first_keys, int num_record, + std::vector& separators, + std::vector& block_handles, + std::vector& first_keys, int num_record, SequenceNumber seqno) { SyncPoint::GetInstance()->DisableProcessing(); block_ = @@ -1545,7 +1768,7 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(true, false), ::testing::Values(true, false)), [](const testing::TestParamInfo< std::tuple> &args) { + uint32_t, bool, bool>>& args) { std::ostringstream oss; oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes" << std::to_string(std::get<1>(args.param)) << "RestartInterval" @@ -1567,18 +1790,17 @@ TEST_P(IndexBlockKVChecksumCorruptionTest, CorruptEntry) { std::vector block_handles; std::vector first_keys; GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, - kNumRecords, 0 /* ts_sz */, - seqno != kDisableGlobalSequenceNumber); + kNumRecords, 0 /* ts_sz */); SyncPoint::GetInstance()->SetCallBack( - "BlockIter::UpdateKey::value", [](void *arg) { - char *value = static_cast(arg); + "BlockIter::UpdateKey::value", [](void* arg) { + char* value = static_cast(arg); // value can be delta-encoded with different lengths, so we corrupt // first bytes here to be safe ++value[0]; }); typedef std::unique_ptr IterPtr; - typedef void(IterAPI)(IterPtr & iter, std::string &); + typedef void(IterAPI)(IterPtr & iter, std::string&); std::string seek_key = first_keys[kNumRecords / 2]; auto test_seek = [&](IterAPI iter_api) { std::unique_ptr biter = GenerateIndexBlockIter( @@ -1588,12 +1810,12 @@ TEST_P(IndexBlockKVChecksumCorruptionTest, CorruptEntry) { ASSERT_FALSE(biter->Valid()); ASSERT_TRUE(biter->status().IsCorruption()); }; - test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); }); - test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); }); - test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); }); + test_seek([](IterPtr& iter, std::string&) { iter->SeekToFirst(); }); + test_seek([](IterPtr& iter, std::string&) { iter->SeekToLast(); }); + test_seek([](IterPtr& iter, std::string& k) { iter->Seek(k); }); typedef void (IndexBlockIter::*IterStepAPI)(); - auto test_step = [&](IterStepAPI iter_api, std::string &k) { + auto test_step = [&](IterStepAPI iter_api, std::string& k) { std::unique_ptr biter = GenerateIndexBlockIter( separators, block_handles, first_keys, kNumRecords, seqno); SyncPoint::GetInstance()->DisableProcessing(); @@ -1619,7 +1841,7 @@ class MetaIndexBlockKVChecksumCorruptionTest MetaIndexBlockKVChecksumCorruptionTest() = default; std::unique_ptr GenerateMetaIndexBlockIter( - std::vector &keys, std::vector &values, + std::vector& keys, std::vector& values, int num_record) { SyncPoint::GetInstance()->DisableProcessing(); block_ = GenerateMetaIndexBlock(keys, values, num_record); @@ -1636,7 +1858,7 @@ class MetaIndexBlockKVChecksumCorruptionTest INSTANTIATE_TEST_CASE_P( P, MetaIndexBlockKVChecksumCorruptionTest, ::testing::Values(4, 8) /* block_protection_bytes_per_key */, - [](const testing::TestParamInfo &args) { + [](const testing::TestParamInfo& args) { std::ostringstream oss; oss << "ProtBytes" << std::to_string(args.param); return oss.str(); @@ -1653,14 +1875,14 @@ TEST_P(MetaIndexBlockKVChecksumCorruptionTest, CorruptEntry) { GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */, 24 /* padding_size */); SyncPoint::GetInstance()->SetCallBack( - "BlockIter::UpdateKey::value", [](void *arg) { - char *value = static_cast(arg); + "BlockIter::UpdateKey::value", [](void* arg) { + char* value = static_cast(arg); // values generated by GenerateRandomKVs are of length 100 ++value[10]; }); typedef std::unique_ptr IterPtr; - typedef void(IterAPI)(IterPtr & iter, std::string &); + typedef void(IterAPI)(IterPtr & iter, std::string&); typedef void (MetaBlockIter::*IterStepAPI)(); std::string seek_key = keys[kNumRecords / 2]; auto test_seek = [&](IterAPI iter_api) { @@ -1671,12 +1893,12 @@ TEST_P(MetaIndexBlockKVChecksumCorruptionTest, CorruptEntry) { ASSERT_TRUE(biter->status().IsCorruption()); }; - test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); }); - test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); }); - test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); }); - test_seek([](IterPtr &iter, std::string &k) { iter->SeekForPrev(k); }); + test_seek([](IterPtr& iter, std::string&) { iter->SeekToFirst(); }); + test_seek([](IterPtr& iter, std::string&) { iter->SeekToLast(); }); + test_seek([](IterPtr& iter, std::string& k) { iter->Seek(k); }); + test_seek([](IterPtr& iter, std::string& k) { iter->SeekForPrev(k); }); - auto test_step = [&](IterStepAPI iter_api, const std::string &k) { + auto test_step = [&](IterStepAPI iter_api, const std::string& k) { IterPtr biter = GenerateMetaIndexBlockIter(keys, values, kNumRecords); SyncPoint::GetInstance()->DisableProcessing(); biter->Seek(k); @@ -1696,7 +1918,7 @@ TEST_P(MetaIndexBlockKVChecksumCorruptionTest, CorruptEntry) { } } // namespace ROCKSDB_NAMESPACE -int main(int argc, char **argv) { +int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/table/block_based/block_type.h b/table/block_based/block_type.h index a9d6a1a773b4..b96f27385493 100644 --- a/table/block_based/block_type.h +++ b/table/block_based/block_type.h @@ -27,8 +27,39 @@ enum class BlockType : uint8_t { kHashIndexMetadata, kMetaIndex, kIndex, + kUserDefinedIndex, // Note: keep kInvalid the last value when adding new enum values. kInvalid }; +inline const char* BlockTypeToString(BlockType block_type) { + switch (block_type) { + case BlockType::kData: + return "Data"; + case BlockType::kFilter: + return "Filter"; + case BlockType::kFilterPartitionIndex: + return "FilterPartitionIndex"; + case BlockType::kProperties: + return "Properties"; + case BlockType::kCompressionDictionary: + return "CompressionDictionary"; + case BlockType::kRangeDeletion: + return "RangeDeletion"; + case BlockType::kHashIndexPrefixes: + return "HashIndexPrefixes"; + case BlockType::kHashIndexMetadata: + return "HashIndexMetadata"; + case BlockType::kMetaIndex: + return "MetaIndex"; + case BlockType::kIndex: + return "Index"; + case BlockType::kUserDefinedIndex: + return "UserDefinedIndex"; + case BlockType::kInvalid: + return "Invalid"; + } + return "Unknown"; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/data_block_footer.cc b/table/block_based/data_block_footer.cc index 5d5d8ed55e4e..24a31c0d52b5 100644 --- a/table/block_based/data_block_footer.cc +++ b/table/block_based/data_block_footer.cc @@ -9,51 +9,55 @@ #include "table/block_based/data_block_footer.h" -#include "rocksdb/table.h" +#include "util/coding.h" namespace ROCKSDB_NAMESPACE { -const int kDataBlockIndexTypeBitShift = 31; +// Hash index bit (bit 31) +constexpr uint32_t kHashIndexBit = 1u << 31; -// 0x7FFFFFFF -const uint32_t kMaxNumRestarts = (1u << kDataBlockIndexTypeBitShift) - 1u; +void DataBlockFooter::EncodeTo(std::string* dst) const { + assert(num_restarts <= kMaxNumRestarts); -// 0x7FFFFFFF -const uint32_t kNumRestartsMask = (1u << kDataBlockIndexTypeBitShift) - 1u; - -uint32_t PackIndexTypeAndNumRestarts( - BlockBasedTableOptions::DataBlockIndexType index_type, - uint32_t num_restarts) { - if (num_restarts > kMaxNumRestarts) { - assert(0); // mute travis "unused" warning - } - - uint32_t block_footer = num_restarts; + uint32_t packed = num_restarts; if (index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash) { - block_footer |= 1u << kDataBlockIndexTypeBitShift; - } else if (index_type != BlockBasedTableOptions::kDataBlockBinarySearch) { - assert(0); + packed |= kHashIndexBit; + } else { + assert(index_type == BlockBasedTableOptions::kDataBlockBinarySearch); } - return block_footer; + PutFixed32(dst, packed); } -void UnPackIndexTypeAndNumRestarts( - uint32_t block_footer, - BlockBasedTableOptions::DataBlockIndexType* index_type, - uint32_t* num_restarts) { - if (index_type) { - if (block_footer & 1u << kDataBlockIndexTypeBitShift) { - *index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; - } else { - *index_type = BlockBasedTableOptions::kDataBlockBinarySearch; - } +Status DataBlockFooter::DecodeFrom(Slice* input) { + if (input->size() < kMinEncodedLength) { + return Status::Corruption("Block too small for footer"); } - if (num_restarts) { - *num_restarts = block_footer & kNumRestartsMask; - assert(*num_restarts <= kMaxNumRestarts); + // Decode from the end of the input + const char* footer_ptr = input->data() + input->size() - kMinEncodedLength; + uint32_t packed = DecodeFixed32(footer_ptr); + + if (packed & kHashIndexBit) { + index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + packed &= ~kHashIndexBit; + } else { + index_type = BlockBasedTableOptions::kDataBlockBinarySearch; } + + // Check for reserved/unrecognized feature bits (anything beyond + // kMaxNumRestarts) + if (packed > kMaxNumRestarts) { + return Status::Corruption( + "Unrecognized feature in block footer (reserved bits set)"); + } + + num_restarts = packed; + + // Remove the footer from the input slice + input->remove_suffix(kMinEncodedLength); + + return Status::OK(); } } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/data_block_footer.h b/table/block_based/data_block_footer.h index c1cfd473099a..74301d0e0a1a 100644 --- a/table/block_based/data_block_footer.h +++ b/table/block_based/data_block_footer.h @@ -9,17 +9,63 @@ #pragma once +#include +#include + +#include "rocksdb/slice.h" +#include "rocksdb/status.h" #include "rocksdb/table.h" namespace ROCKSDB_NAMESPACE { -uint32_t PackIndexTypeAndNumRestarts( - BlockBasedTableOptions::DataBlockIndexType index_type, - uint32_t num_restarts); +// DataBlockFooter represents the footer of a data block, containing metadata +// about the block's structure and features. +// +// Current encoding (may expand in future format versions): +// - A single uint32_t where: +// - The low 28 bits store the number of restart points (num_restarts) +// - The high 4 bits are reserved for metadata/features: +// - Bit 31: Hash index present (kDataBlockBinaryAndHash) +// - Bits 28-30: Reserved for future features +// +// When any unrecognized reserved bit is set, DecodeFrom() returns an error, +// allowing older versions to fail gracefully on newer formats. +// +// The encoding size is not fixed - future format versions may expand it. +// Use kMaxEncodedLength for buffer sizing. +struct DataBlockFooter { + // Maximum number of restarts that can be stored (2^28 - 1 = 268,435,455). + // This reserves the top 4 bits for metadata (bit 31 for hash index, bits + // 28-30 for future features). For historical compatibility purposes, the + // limit is adequate because a 4GiB block (maximum due to 32-bit block size) + // with restart_interval=1 and minimum entries (12 bytes: 3 varint bytes + + // 9-byte internal key + empty value) plus 4-byte restart offsets = 16 bytes + // per restart, fits at most (2^32 - 4) / 16 ≈ 268 million restarts. + static constexpr uint32_t kMaxNumRestarts = (1u << 28) - 1; + + // Maximum encoded length of a DataBlockFooter (for buffer sizing) + // Currently 4 bytes, but may grow in future format versions. + static constexpr uint32_t kMaxEncodedLength = sizeof(uint32_t); + + // Minimum encoded length (for current format version) + static constexpr uint32_t kMinEncodedLength = sizeof(uint32_t); + + BlockBasedTableOptions::DataBlockIndexType index_type = + BlockBasedTableOptions::kDataBlockBinarySearch; + uint32_t num_restarts = 0; + + DataBlockFooter() = default; + DataBlockFooter(BlockBasedTableOptions::DataBlockIndexType _index_type, + uint32_t _num_restarts) + : index_type(_index_type), num_restarts(_num_restarts) {} + + // Appends the encoded footer to dst. + void EncodeTo(std::string* dst) const; -void UnPackIndexTypeAndNumRestarts( - uint32_t block_footer, - BlockBasedTableOptions::DataBlockIndexType* index_type, - uint32_t* num_restarts); + // Decodes a footer from the end of input (consumes bytes from the end). + // Returns an error if reserved/unrecognized feature bits are set. + // On success, advances input to exclude the consumed footer bytes. + Status DecodeFrom(Slice* input); +}; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index 7970ca1d9f9b..5bf0faa14ab0 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -582,7 +582,8 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, const bool kSkipFilters = true; const bool kImmortal = true; ASSERT_OK(moptions.table_factory->NewTableReader( - TableReaderOptions(ioptions, moptions.prefix_extractor, soptions, + TableReaderOptions(ioptions, moptions.prefix_extractor, + nullptr /* compression_manager */, soptions, internal_comparator, 0 /* block_protection_bytes_per_key */, !kSkipFilters, !kImmortal, level_), diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index 6f502cc0e59b..e0c0d094554e 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -68,6 +68,18 @@ class FilterBlockBuilder { // For reporting stats on how many entries the builder considered unique virtual size_t EstimateEntriesAdded() = 0; + // Returns an estimate of the current filter size based on the builder's + // state. Implementations should cache the estimate and update it via + // UpdateFilterSizeEstimate() to avoid recalculating on every key add. + // + // Can be called at any time during table construction, even before calling + // Finish(). Used during table construction to determine when to cut files. + virtual size_t CurrentFilterSizeEstimate() = 0; + + // Provides a hook for filter builder when a data block is finalized, such as + // to update cached filter size estimates. + virtual void OnDataBlockFinalized(uint64_t /* num_data_blocks */) {} + // When using AddWithPrevKey, this must be called before Finish(). (May also // be called without AddWithPrevKey, but prev_key_without_ts must be // accurate regardless.) @@ -110,6 +122,11 @@ class FilterBlockBuilder { return filter; } #endif // NDEBUG + + protected: + // Update cached filter size estimate. Subclasses should override to update + // estimates based on their internal state. + virtual void UpdateFilterSizeEstimate(uint64_t /* num_data_blocks */) {} }; // A FilterBlockReader is used to parse filter from SST table. diff --git a/table/block_based/filter_block_reader_common.cc b/table/block_based/filter_block_reader_common.cc index 343e9406b571..32c43ac09f3c 100644 --- a/table/block_based/filter_block_reader_common.cc +++ b/table/block_based/filter_block_reader_common.cc @@ -30,8 +30,7 @@ Status FilterBlockReaderCommon::ReadFilterBlock( const Status s = table->RetrieveBlock( prefetch_buffer, read_options, rep->filter_handle, - UncompressionDict::GetEmptyDict(), filter_block, get_context, - lookup_context, + /* decomp */ nullptr, filter_block, get_context, lookup_context, /* for_compaction */ false, use_cache, /* async_read */ false, /* use_block_cache_for_lookup */ true); diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc index 3df973aa4ca8..cdc4c144c369 100644 --- a/table/block_based/filter_policy.cc +++ b/table/block_based/filter_policy.cc @@ -17,7 +17,6 @@ #include #include -#include "cache/cache_entry_roles.h" #include "cache/cache_reservation_manager.h" #include "logging/logging.h" #include "port/lang.h" @@ -29,8 +28,8 @@ #include "table/block_based/block_based_table_reader.h" #include "table/block_based/filter_policy_internal.h" #include "table/block_based/full_filter_block.h" +#include "util/atomic.h" #include "util/bloom_impl.h" -#include "util/coding.h" #include "util/hash.h" #include "util/math.h" #include "util/ribbon_config.h" @@ -61,7 +60,7 @@ Slice FinishAlwaysTrue(std::unique_ptr* /*buf*/) { // Base class for filter builders using the XXH3 preview hash, // also known as Hash64 or GetSliceHash64. -class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { +class XXPH3FilterBitsBuilder : public FilterBitsBuilder { public: explicit XXPH3FilterBitsBuilder( std::atomic* aggregate_rounding_balance, @@ -126,8 +125,11 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { } } + // Returns an estimate of the number of entries added to the + // filter. This method is thread-safe and can be safely called + // from background threads during parallel compression. size_t EstimateEntriesAdded() override { - return hash_entries_info_.entries.size(); + return hash_entries_info_.entries_count.LoadRelaxed(); } Status MaybePostVerify(const Slice& filter_content) override; @@ -147,6 +149,7 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { hash_entries_info_.xor_checksum ^= hash; } hash_entries_info_.entries.push_back(hash); + hash_entries_info_.entries_count.FetchAddRelaxed(1); if (cache_res_mgr_ && // Traditional rounding to whole bucket size ((hash_entries_info_.entries.size() % @@ -314,6 +317,10 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { // and has near-minimal peak memory use. std::deque entries; + // Tracks the number of entries added for thread-safe + // size estimation. + RelaxedAtomic entries_count{0}; + // If cache_res_mgr_ != nullptr, // it manages cache charge for buckets of hash entries in (new) Bloom // or Ribbon Filter construction. @@ -332,6 +339,8 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { void Swap(HashEntriesInfo* other) { assert(other != nullptr); std::swap(entries, other->entries); + entries_count.StoreRelaxed( + other->entries_count.ExchangeRelaxed(entries_count.LoadRelaxed())); std::swap(cache_res_bucket_handles, other->cache_res_bucket_handles); std::swap(xor_checksum, other->xor_checksum); std::swap(prev_alt_hash, other->prev_alt_hash); @@ -339,6 +348,7 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { void Reset() { entries.clear(); + entries_count.StoreRelaxed(0); cache_res_bucket_handles.clear(); xor_checksum = 0; prev_alt_hash = {}; @@ -1012,9 +1022,6 @@ class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder { FastLocalBloomBitsBuilder bloom_fallback_; }; -// for the linker, at least with DEBUG_LEVEL=2 -constexpr uint32_t Standard128RibbonBitsBuilder::kMaxRibbonEntries; - class Standard128RibbonBitsReader : public BuiltinFilterBitsReader { public: Standard128RibbonBitsReader(const char* data, size_t len_bytes, @@ -1069,7 +1076,7 @@ class Standard128RibbonBitsReader : public BuiltinFilterBitsReader { using LegacyBloomImpl = LegacyLocalityBloomImpl; -class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { +class LegacyBloomBitsBuilder : public FilterBitsBuilder { public: explicit LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log); diff --git a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h index a823bf059732..3e6df57194dc 100644 --- a/table/block_based/filter_policy_internal.h +++ b/table/block_based/filter_policy_internal.h @@ -90,6 +90,19 @@ class FilterBitsBuilder { // <= the specified number of bytes. Callers (including RocksDB) should // only use this result for optimizing performance and not as a guarantee. virtual size_t ApproximateNumEntries(size_t bytes) = 0; + + // Calculate number of bytes needed for a new filter, including + // metadata. Passing the result to ApproximateNumEntries should + // (ideally, usually) return >= the num_entry passed in. + // When optimize_filters_for_memory is enabled, this function + // is not authoritative but represents a target size that should + // be close to the average size. + virtual size_t CalculateSpace(size_t num_entries) = 0; + + // Returns an estimate of the FP rate of the returned filter if + // `num_entries` keys are added and the filter returned by Finish + // is `bytes` bytes. + virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0; }; // A class that checks if a key can be in filter @@ -109,24 +122,6 @@ class FilterBitsReader { } }; -// Exposes any extra information needed for testing built-in -// FilterBitsBuilders -class BuiltinFilterBitsBuilder : public FilterBitsBuilder { - public: - // Calculate number of bytes needed for a new filter, including - // metadata. Passing the result to ApproximateNumEntries should - // (ideally, usually) return >= the num_entry passed in. - // When optimize_filters_for_memory is enabled, this function - // is not authoritative but represents a target size that should - // be close to the average size. - virtual size_t CalculateSpace(size_t num_entries) = 0; - - // Returns an estimate of the FP rate of the returned filter if - // `num_entries` keys are added and the filter returned by Finish - // is `bytes` bytes. - virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0; -}; - // Base class for RocksDB built-in filter reader with // extra useful functionalities for inernal. class BuiltinFilterBitsReader : public FilterBitsReader { diff --git a/table/block_based/flush_block_policy.cc b/table/block_based/flush_block_policy.cc index d5cc310013f2..f01315ceb970 100644 --- a/table/block_based/flush_block_policy.cc +++ b/table/block_based/flush_block_policy.cc @@ -19,7 +19,7 @@ namespace ROCKSDB_NAMESPACE { // Flush block by size -class FlushBlockBySizePolicy : public FlushBlockPolicy { +class FlushBlockBySizePolicy : public RetargetableFlushBlockPolicy { public: // @params block_size: Approximate size of user data packed per // block. @@ -28,19 +28,19 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy { FlushBlockBySizePolicy(const uint64_t block_size, const uint64_t block_size_deviation, const bool align, const BlockBuilder& data_block_builder) - : block_size_(block_size), + : RetargetableFlushBlockPolicy(data_block_builder), + block_size_(block_size), block_size_deviation_limit_( ((block_size * (100 - block_size_deviation)) + 99) / 100), - align_(align), - data_block_builder_(data_block_builder) {} + align_(align) {} bool Update(const Slice& key, const Slice& value) override { // it makes no sense to flush when the data block is empty - if (data_block_builder_.empty()) { + if (data_block_builder_->empty()) { return false; } - auto curr_size = data_block_builder_.CurrentSizeEstimate(); + auto curr_size = data_block_builder_->CurrentSizeEstimate(); // Do flush if one of the below two conditions is true: // 1) if the current estimated size already exceeds the block size, @@ -56,9 +56,9 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy { return false; } - const auto curr_size = data_block_builder_.CurrentSizeEstimate(); + const auto curr_size = data_block_builder_->CurrentSizeEstimate(); auto estimated_size_after = - data_block_builder_.EstimateSizeAfterKV(key, value); + data_block_builder_->EstimateSizeAfterKV(key, value); if (align_) { estimated_size_after += BlockBasedTable::kBlockTrailerSize; @@ -72,7 +72,6 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy { const uint64_t block_size_; const uint64_t block_size_deviation_limit_; const bool align_; - const BlockBuilder& data_block_builder_; }; FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( @@ -83,10 +82,18 @@ FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_options.block_align, data_block_builder); } +std::unique_ptr NewFlushBlockBySizePolicy( + const uint64_t size, const int deviation, + const BlockBuilder& data_block_builder) { + return std::make_unique(size, deviation, false, + data_block_builder); +} + FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( const uint64_t size, const int deviation, const BlockBuilder& data_block_builder) { - return new FlushBlockBySizePolicy(size, deviation, false, data_block_builder); + return NewFlushBlockBySizePolicy(size, deviation, data_block_builder) + .release(); } static int RegisterFlushBlockPolicyFactories(ObjectLibrary& library, diff --git a/table/block_based/flush_block_policy_impl.h b/table/block_based/flush_block_policy_impl.h index 4f79682bc25f..96132304d6e0 100644 --- a/table/block_based/flush_block_policy_impl.h +++ b/table/block_based/flush_block_policy_impl.h @@ -3,6 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#pragma once #include "rocksdb/flush_block_policy.h" namespace ROCKSDB_NAMESPACE { @@ -37,4 +38,23 @@ class FlushBlockEveryKeyPolicyFactory : public FlushBlockPolicyFactory { } }; +// For internal use, policy that is stateless after creation, meaning it can +// be safely re-targeted to another block builder. +class RetargetableFlushBlockPolicy : public FlushBlockPolicy { + public: + explicit RetargetableFlushBlockPolicy(const BlockBuilder& data_block_builder) + : data_block_builder_(&data_block_builder) {} + + void Retarget(const BlockBuilder& data_block_builder) { + data_block_builder_ = &data_block_builder; + } + + protected: + const BlockBuilder* data_block_builder_; +}; + +std::unique_ptr NewFlushBlockBySizePolicy( + const uint64_t size, const int deviation, + const BlockBuilder& data_block_builder); + } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index af741787a32d..c7d069f3e524 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -30,6 +30,35 @@ size_t FullFilterBlockBuilder::EstimateEntriesAdded() { return filter_bits_builder_->EstimateEntriesAdded(); } +void FullFilterBlockBuilder::OnDataBlockFinalized(uint64_t num_data_blocks) { + UpdateFilterSizeEstimate(num_data_blocks); +} + +size_t FullFilterBlockBuilder::CurrentFilterSizeEstimate() { + return estimated_filter_size_; +} + +void FullFilterBlockBuilder::UpdateFilterSizeEstimate( + uint64_t num_data_blocks) { + size_t entries_added = filter_bits_builder_->EstimateEntriesAdded(); + + if (entries_added == 0) { + estimated_filter_size_ = 0; + return; + } + + size_t filter_size = filter_bits_builder_->CalculateSpace(entries_added); + + // Reserve filter space for next data block ~2x the average. + size_t buffer_size = 0; + if (num_data_blocks > 0) { + buffer_size = (filter_size / num_data_blocks) * 2; + estimated_filter_size_ = filter_size + buffer_size; + } else { + estimated_filter_size_ = filter_size; + } +} + void FullFilterBlockBuilder::AddWithPrevKey( const Slice& key_without_ts, const Slice& /*prev_key_without_ts*/) { FullFilterBlockBuilder::Add(key_without_ts); diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index 784f0eb881c3..96e8300b2086 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -57,6 +57,8 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { return filter_bits_builder_->EstimateEntriesAdded() == 0; } size_t EstimateEntriesAdded() override; + size_t CurrentFilterSizeEstimate() override; + void OnDataBlockFinalized(uint64_t num_data_blocks) override; Status Finish(const BlockHandle& last_partition_block_handle, Slice* filter, std::unique_ptr* filter_owner = nullptr) override; using FilterBlockBuilder::Finish; @@ -73,6 +75,8 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { std::unique_ptr filter_bits_builder_; + void UpdateFilterSizeEstimate(uint64_t num_data_blocks_written) override; + private: // important: all of these might point to invalid addresses // at the time of destruction of this filter block. destructor @@ -80,6 +84,8 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { const SliceTransform* const prefix_extractor_; const bool whole_key_filtering_; std::unique_ptr filter_data_; + + size_t estimated_filter_size_ = 0; }; // A FilterBlockReader is used to parse filter from SST table. diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index f90492d8583b..1ce6844741eb 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -52,6 +52,13 @@ class TestFilterBitsBuilder : public FilterBitsBuilder { size_t ApproximateNumEntries(size_t bytes) override { return bytes / 4; } + size_t CalculateSpace(size_t num_entries) override { return num_entries * 4; } + + double EstimatedFpRate(size_t /* num_entries */, + size_t /* bytes */) override { + return 0.0; + } + private: std::vector hash_entries_; }; @@ -229,6 +236,14 @@ class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder { return b_->ApproximateNumEntries(bytes); } + size_t CalculateSpace(size_t num_entries) override { + return b_->CalculateSpace(num_entries); + } + + double EstimatedFpRate(size_t num_entries, size_t bytes) override { + return b_->EstimatedFpRate(num_entries, bytes); + } + size_t CountUnique() { return uniq_.size(); } }; diff --git a/table/block_based/hash_index_reader.cc b/table/block_based/hash_index_reader.cc index 2cf67367b998..1a6c0aeb0f06 100644 --- a/table/block_based/hash_index_reader.cc +++ b/table/block_based/hash_index_reader.cc @@ -76,8 +76,8 @@ Status HashIndexReader::Create(const BlockBasedTable* table, BlockFetcher prefixes_block_fetcher( file, prefetch_buffer, footer, ro, prefixes_handle, &prefixes_contents, ioptions, true /*decompress*/, true /*maybe_compressed*/, - BlockType::kHashIndexPrefixes, UncompressionDict::GetEmptyDict(), - cache_options, memory_allocator); + BlockType::kHashIndexPrefixes, rep->decompressor.get(), cache_options, + memory_allocator); s = prefixes_block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; @@ -87,7 +87,7 @@ Status HashIndexReader::Create(const BlockBasedTable* table, file, prefetch_buffer, footer, ro, prefixes_meta_handle, &prefixes_meta_contents, ioptions, true /*decompress*/, true /*maybe_compressed*/, BlockType::kHashIndexMetadata, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + rep->decompressor.get(), cache_options, memory_allocator); s = prefixes_meta_block_fetcher.ReadBlockContents(); if (!s.ok()) { // TODO: log error diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc index a5a34d65b670..8de01f0b7a22 100644 --- a/table/block_based/index_builder.cc +++ b/table/block_based/index_builder.cc @@ -66,7 +66,7 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( break; } default: { - assert(!"Do not recognize the index type "); + assert(false && "Do not recognize the index type "); break; } } @@ -117,6 +117,20 @@ Slice ShortenedIndexBuilder::FindShortInternalKeySuccessor( } } +void ShortenedIndexBuilder::UpdateIndexSizeEstimate() { + uint64_t current_size = + must_use_separator_with_seq_.LoadRelaxed() + ? index_block_builder_.CurrentSizeEstimate() + : index_block_builder_without_seq_.CurrentSizeEstimate(); + + uint64_t final_estimate = current_size; + if (num_index_entries_ > 0) { + // Add buffer to generously account (in most cases) for the next index entry + final_estimate += (2 * (current_size / num_index_entries_)); + } + estimated_index_size_.StoreRelaxed(final_estimate); +} + PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder( const InternalKeyComparator* comparator, const bool use_value_delta_encoding, @@ -152,32 +166,43 @@ PartitionedIndexBuilder::PartitionedIndexBuilder( // sub_index_builder. Otherwise, it could be set to true even one of the // sub_index_builders could not safely exclude seq from the keys, then it // wil be enforced on all sub_index_builders on ::Finish. - seperator_is_key_plus_seq_(false), - use_value_delta_encoding_(use_value_delta_encoding) {} + must_use_separator_with_seq_(false), + use_value_delta_encoding_(use_value_delta_encoding) { + MakeNewSubIndexBuilder(); +} void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { - assert(sub_index_builder_ == nullptr); - sub_index_builder_ = std::make_unique( + auto new_builder = std::make_unique( comparator_, table_opt_.index_block_restart_interval, table_opt_.format_version, use_value_delta_encoding_, table_opt_.index_shortening, /* include_first_key */ false, ts_sz_, persist_user_defined_timestamps_); + sub_index_builder_ = new_builder.get(); + // Start next partition entry, where we will modify the key + entries_.push_back({{}, std::move(new_builder)}); - // Set sub_index_builder_->seperator_is_key_plus_seq_ to true if - // seperator_is_key_plus_seq_ is true (internal-key mode) (set to false by + BlockBuilder* builder_to_monitor; + // Set sub_index_builder_->must_use_separator_with_seq_ to true if + // must_use_separator_with_seq_ is true (internal-key mode) (set to false by // default on Creation) so that flush policy can point to // sub_index_builder_->index_block_builder_ - if (seperator_is_key_plus_seq_) { - sub_index_builder_->seperator_is_key_plus_seq_ = true; + if (must_use_separator_with_seq_.LoadRelaxed()) { + sub_index_builder_->must_use_separator_with_seq_.StoreRelaxed(true); + builder_to_monitor = &sub_index_builder_->index_block_builder_; + } else { + builder_to_monitor = &sub_index_builder_->index_block_builder_without_seq_; } - flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( - table_opt_.metadata_block_size, table_opt_.block_size_deviation, - // Note: this is sub-optimal since sub_index_builder_ could later reset - // seperator_is_key_plus_seq_ but the probability of that is low. - sub_index_builder_->seperator_is_key_plus_seq_ - ? sub_index_builder_->index_block_builder_ - : sub_index_builder_->index_block_builder_without_seq_)); + if (flush_policy_ == nullptr) { + // Note: some partitions could be sub-optimal since sub_index_builder_ + // could later reset must_use_separator_with_seq_ but the probability and + // impact of that are low. + flush_policy_ = NewFlushBlockBySizePolicy(table_opt_.metadata_block_size, + table_opt_.block_size_deviation, + *builder_to_monitor); + } else { + flush_policy_->Retarget(*builder_to_monitor); + } partition_cut_requested_ = false; } @@ -185,101 +210,135 @@ void PartitionedIndexBuilder::RequestPartitionCut() { partition_cut_requested_ = true; } +std::unique_ptr +PartitionedIndexBuilder::CreatePreparedIndexEntry() { + // Fortunately, for ShortenedIndexBuilder, we can prepare an entry from one + // similarly configured builder and finish it at another. + return entries_.front().value->CreatePreparedIndexEntry(); +} +void PartitionedIndexBuilder::PrepareIndexEntry( + const Slice& last_key_in_current_block, + const Slice* first_key_in_next_block, PreparedIndexEntry* out) { + // Fortunately, for ShortenedIndexBuilder, we can prepare an entry from one + // similarly configured builder and finish it at another. We just have to + // keep in mind that this first sub builder keeps track of the original + // must_use_separator_with_seq_ in the pipeline that is then propagated. + return entries_.front().value->PrepareIndexEntry( + last_key_in_current_block, first_key_in_next_block, out); +} + +void PartitionedIndexBuilder::MaybeFlush(const Slice& index_key, + const BlockHandle& index_value) { + bool do_flush = !sub_index_builder_->index_block_builder_.empty() && + (partition_cut_requested_ || + flush_policy_->Update( + index_key, EncodedBlockHandle(index_value).AsSlice())); + if (do_flush) { + assert(entries_.back().value.get() == sub_index_builder_); + + // Update estimate of completed partitions when a partition is flushed + estimated_completed_partitions_size_.FetchAddRelaxed( + sub_index_builder_->CurrentIndexSizeEstimate()); + + cut_filter_block = true; + MakeNewSubIndexBuilder(); + } +} + +void PartitionedIndexBuilder::FinishIndexEntry(const BlockHandle& block_handle, + PreparedIndexEntry* base_entry, + bool skip_delta_encoding) { + using SPIE = ShortenedIndexBuilder::ShortenedPreparedIndexEntry; + SPIE* entry = static_cast(base_entry); + + MaybeFlush(entry->separator_with_seq, block_handle); + + sub_index_builder_->FinishIndexEntry(block_handle, base_entry, + skip_delta_encoding); + std::swap(entries_.back().key, entry->separator_with_seq); + + // Update cached size estimate when data blocks are finalized for more + // accurate tail size estimation. This is needed for parallel compression + // which uses FinishIndexEntry() instead of AddIndexEntry(). + UpdateIndexSizeEstimate(); + + if (!must_use_separator_with_seq_.LoadRelaxed() && + entry->must_use_separator_with_seq) { + // We need to apply !must_use_separator_with_seq to all sub-index builders + must_use_separator_with_seq_.StoreRelaxed(true); + flush_policy_->Retarget(sub_index_builder_->index_block_builder_); + } + // NOTE: not compatible with coupled partitioned filters so don't need to + // cut_filter_block +} + Slice PartitionedIndexBuilder::AddIndexEntry( const Slice& last_key_in_current_block, const Slice* first_key_in_next_block, const BlockHandle& block_handle, - std::string* separator_scratch) { - // Note: to avoid two consecuitive flush in the same method call, we do not - // check flush policy when adding the last key - if (UNLIKELY(first_key_in_next_block == nullptr)) { // no more keys - if (sub_index_builder_ == nullptr) { - MakeNewSubIndexBuilder(); - // Reserve next partition entry, where we will modify the key and - // eventually set the value - entries_.push_back({{}, {}}); - } - auto sep = sub_index_builder_->AddIndexEntry( - last_key_in_current_block, first_key_in_next_block, block_handle, - separator_scratch); - if (!seperator_is_key_plus_seq_ && - sub_index_builder_->seperator_is_key_plus_seq_) { - // We need to apply !seperator_is_key_plus_seq to all sub-index builders - seperator_is_key_plus_seq_ = true; - // Would associate flush_policy with the appropriate builder, but it won't - // be used again with no more keys - flush_policy_.reset(); - } - entries_.back().key.assign(sep.data(), sep.size()); - assert(entries_.back().value == nullptr); - std::swap(entries_.back().value, sub_index_builder_); + std::string* separator_scratch, bool skip_delta_encoding) { + // At least when running without parallel compression, maintain behavior of + // avoiding a last index partition with just one entry + if (first_key_in_next_block) { + MaybeFlush(last_key_in_current_block, block_handle); + } + + auto sep = sub_index_builder_->AddIndexEntry( + last_key_in_current_block, first_key_in_next_block, block_handle, + separator_scratch, skip_delta_encoding); + entries_.back().key.assign(sep.data(), sep.size()); + + // Update cached size estimate when data blocks are finalized for more + // accurate tail size estimation. This ensures the estimate reflects current + // state after each data block is added. + UpdateIndexSizeEstimate(); + + if (!must_use_separator_with_seq_.LoadRelaxed() && + sub_index_builder_->must_use_separator_with_seq_.LoadRelaxed()) { + // We need to apply !must_use_separator_with_seq to all sub-index builders + must_use_separator_with_seq_.StoreRelaxed(true); + flush_policy_->Retarget(sub_index_builder_->index_block_builder_); + } + if (UNLIKELY(first_key_in_next_block == nullptr)) { + // no more keys cut_filter_block = true; - return sep; - } else { - // apply flush policy only to non-empty sub_index_builder_ - if (sub_index_builder_ != nullptr) { - std::string handle_encoding; - block_handle.EncodeTo(&handle_encoding); - bool do_flush = - partition_cut_requested_ || - flush_policy_->Update(last_key_in_current_block, handle_encoding); - if (do_flush) { - assert(entries_.back().value == nullptr); - std::swap(entries_.back().value, sub_index_builder_); - cut_filter_block = true; - } - } - if (sub_index_builder_ == nullptr) { - MakeNewSubIndexBuilder(); - // Reserve next partition entry, where we will modify the key and - // eventually set the value - entries_.push_back({{}, {}}); - } - auto sep = sub_index_builder_->AddIndexEntry( - last_key_in_current_block, first_key_in_next_block, block_handle, - separator_scratch); - entries_.back().key.assign(sep.data(), sep.size()); - if (!seperator_is_key_plus_seq_ && - sub_index_builder_->seperator_is_key_plus_seq_) { - // We need to apply !seperator_is_key_plus_seq to all sub-index builders - seperator_is_key_plus_seq_ = true; - // And use a flush_policy with the appropriate builder - flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( - table_opt_.metadata_block_size, table_opt_.block_size_deviation, - sub_index_builder_->index_block_builder_)); - } - return sep; } + return sep; } Status PartitionedIndexBuilder::Finish( IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) { if (partition_cnt_ == 0) { - partition_cnt_ = entries_.size(); + sub_index_builder_ = nullptr; + if (!entries_.empty()) { + // Remove the last entry if it is empty + if (entries_.back().value->index_block_builder_.empty()) { + assert(entries_.back().key.empty()); + entries_.pop_back(); + } + partition_cnt_ = entries_.size(); + } } - // It must be set to null after last key is added - assert(sub_index_builder_ == nullptr); - if (finishing_indexes == true) { + if (finishing_indexes_ == true) { Entry& last_entry = entries_.front(); - std::string handle_encoding; - last_partition_block_handle.EncodeTo(&handle_encoding); + EncodedBlockHandle handle_encoding(last_partition_block_handle); std::string handle_delta_encoding; PutVarsignedint64( &handle_delta_encoding, last_partition_block_handle.size() - last_encoded_handle_.size()); last_encoded_handle_ = last_partition_block_handle; const Slice handle_delta_encoding_slice(handle_delta_encoding); - index_block_builder_.Add(last_entry.key, handle_encoding, + index_block_builder_.Add(last_entry.key, handle_encoding.AsSlice(), &handle_delta_encoding_slice); - if (!seperator_is_key_plus_seq_) { + if (!must_use_separator_with_seq_.LoadRelaxed()) { index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key), - handle_encoding, + handle_encoding.AsSlice(), &handle_delta_encoding_slice); } entries_.pop_front(); } // If there is no sub_index left, then return the 2nd level index. if (UNLIKELY(entries_.empty())) { - if (seperator_is_key_plus_seq_) { + if (must_use_separator_with_seq_.LoadRelaxed()) { index_blocks->index_block_contents = index_block_builder_.Finish(); } else { index_blocks->index_block_contents = @@ -293,13 +352,59 @@ Status PartitionedIndexBuilder::Finish( // expect more calls to Finish Entry& entry = entries_.front(); // Apply the policy to all sub-indexes - entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_; + entry.value->must_use_separator_with_seq_.StoreRelaxed( + must_use_separator_with_seq_.LoadRelaxed()); auto s = entry.value->Finish(index_blocks); index_size_ += index_blocks->index_block_contents.size(); - finishing_indexes = true; + finishing_indexes_ = true; return s.ok() ? Status::Incomplete() : s; } } size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; } + +void PartitionedIndexBuilder::UpdateIndexSizeEstimate() { + uint64_t total_size = 0; + + // Ignore last entry which is a placeholder for the partition being built + size_t completed_partitions = entries_.size() > 0 ? entries_.size() - 1 : 0; + + // Use running estimate of completed partitions instead of IndexSize() which + // is only available after calling Finish(). + uint64_t completed_partitions_size = + estimated_completed_partitions_size_.LoadRelaxed(); + total_size += completed_partitions_size; + + // Add current active partition size if it exists + uint64_t current_sub_index_size = 0; + if (sub_index_builder_ != nullptr) { + current_sub_index_size = sub_index_builder_->CurrentIndexSizeEstimate(); + total_size += current_sub_index_size; + } + + // Add buffer for top-level index and next partition + uint64_t buffer_size = 0; + if (completed_partitions > 0) { + // Calculate top-level index size. Each top-level entry consists of: + // separator key (~20-50 bytes) + BlockHandle (~20 bytes) + overhead + // Estimate ~70 bytes per top-level entry as a reasonable average + auto estimated_top_level_size = completed_partitions * 70; + total_size += completed_partitions * 70; + + // Buffer for next partition + next top-level entry + uint64_t avg_partition_size = + completed_partitions_size / completed_partitions; + uint64_t avg_top_level_entry_size = + estimated_top_level_size / completed_partitions; + + buffer_size = 2 * (avg_partition_size + avg_top_level_entry_size); + total_size += buffer_size; + } else if (sub_index_builder_ != nullptr) { + // For the first partition, estimate using the current partition's state + buffer_size = 2 * current_sub_index_size; + total_size += buffer_size; + } + estimated_index_size_.StoreRelaxed(total_size); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h index 99b348b2ff1d..a33935c051d3 100644 --- a/table/block_based/index_builder.h +++ b/table/block_based/index_builder.h @@ -18,7 +18,9 @@ #include "rocksdb/comparator.h" #include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_builder.h" +#include "table/block_based/flush_block_policy_impl.h" #include "table/format.h" +#include "util/atomic.h" namespace ROCKSDB_NAMESPACE { // The interface for building index. @@ -46,7 +48,7 @@ class IndexBuilder { // primary index. struct IndexBlocks { Slice index_block_contents; - std::unordered_map meta_blocks; + std::unordered_map> meta_blocks; }; IndexBuilder(const InternalKeyComparator* comparator, size_t ts_sz, bool persist_user_defined_timestamps) @@ -67,6 +69,9 @@ class IndexBuilder { // the last one in the table // @separator_scratch: a scratch buffer to back a computed separator between // those, as needed. May be modified on each call. + // @skip_delta_encoding: whether to skip delta encoding for this index entry + // for cases of violating the assumption that this + // block_handle starts where the last one ended. // @return: the key or separator stored in the index, which could be // last_key_in_current_block or a computed separator backed by // separator_scratch or last_key_in_current_block. @@ -74,11 +79,57 @@ class IndexBuilder { virtual Slice AddIndexEntry(const Slice& last_key_in_current_block, const Slice* first_key_in_next_block, const BlockHandle& block_handle, - std::string* separator_scratch) = 0; + std::string* separator_scratch, + bool skip_delta_encoding) = 0; + + // An abstract (extensible) holder for passing data from PrepareIndexEntry to + // FinishIndexEntry (see below). + struct PreparedIndexEntry { + virtual ~PreparedIndexEntry() = default; + }; + + // Parallel compression/construction alternative to AddIndexEntry, 1/3 + // + // This function creates a holder for data that needs to be passed from + // PrepareIndexEntry to FinishIndexEntry, depending on the implementation + // of those. Few of these are created and reused, so construction/destruction + // performance is not critical. + virtual std::unique_ptr CreatePreparedIndexEntry() = 0; + + // Parallel compression/construction alternative to AddIndexEntry, 2/3 + // + // One thread calls this function for successive index entries to compute and + // record in `out` what is needed to build the index entry EXCEPT for the + // BlockHandle, which will only be known later. That thread is generally the + // same thread as calls every other function such as OnKeyAdded EXCEPT + // FinishIndexEntry (see below). This function should be considered "mostly + // stateless" but might modify state distinct from what is modified by + // FinishIndexEntry. Ideally synchronization within the IndexBuilder can be + // avoided. + // + // The passed-in PreparedIndexEntry object is likely reused so might be + // passed-in in any state. + virtual void PrepareIndexEntry(const Slice& last_key_in_current_block, + const Slice* first_key_in_next_block, + PreparedIndexEntry* out) = 0; + + // Parallel compression/construction alternative to AddIndexEntry, 3/3 + // + // This function is called by a different thread than PrepareIndexEntry, but + // is called on entries in the same order as PrepareIndexEntry, passed in the + // PreparedIndexEntry objects populated by PrepareIndexEntry. This function + // finishes the same effect of AddIndexEntry but split across a few functions. + // + // External synchronization ensures Finish is only called after all the + // FinishIndexEntry calls have completed. + virtual void FinishIndexEntry(const BlockHandle& block_handle, + PreparedIndexEntry* entry, + bool skip_delta_encoding) = 0; // This method will be called whenever a key is added. The subclasses may // override OnKeyAdded() if they need to collect additional information. - virtual void OnKeyAdded(const Slice& /*key*/) {} + virtual void OnKeyAdded(const Slice& /*key*/, + const std::optional& /*value*/) {} // Inform the index builder that all entries has been written. Block builder // may therefore perform any operation required for block finalization. @@ -108,7 +159,17 @@ class IndexBuilder { // Get the size for index block. Must be called after ::Finish. virtual size_t IndexSize() const = 0; - virtual bool seperator_is_key_plus_seq() { return true; } + // Returns an estimate of the current index size based on the builder's state. + // Implementations should cache the estimate and update it via + // UpdateIndexSizeEstimate() to avoid recalculating on every key add, + // which is critical for performance in the compaction hot path. + // + // This function is only called by the SST "emit thread" but must be + // thread safe with concurrent calls to UpdateIndexSizeEstimate() from another + // thread (such as during parallel compression). + virtual uint64_t CurrentIndexSizeEstimate() const = 0; + + virtual bool separator_is_key_plus_seq() { return true; } protected: // Given the last key in current block and the first key in the next block, @@ -116,7 +177,7 @@ class IndexBuilder { // can be used as separator. inline bool ShouldUseKeyPlusSeqAsSeparator( const Slice& last_key_in_current_block, - const Slice& first_key_in_next_block) { + const Slice& first_key_in_next_block) const { Slice l_user_key = ExtractUserKey(last_key_in_current_block); Slice r_user_key = ExtractUserKey(first_key_in_next_block); // If user defined timestamps are not persisted. All the user keys will @@ -130,6 +191,13 @@ class IndexBuilder { l_user_key, r_user_key) == 0; } + // Updates the cached index size estimate used by CurrentIndexSizeEstimate(). + // + // This function can be called from the SST "write thread" (via + // FinishIndexEntry()), and needs to be thread safe with + // CurrentIndexSizeEstimate() called from the SST "emit thread". + virtual void UpdateIndexSizeEstimate() {} + const InternalKeyComparator* comparator_; // Size of user-defined timestamp in bytes. size_t ts_sz_; @@ -177,63 +245,78 @@ class ShortenedIndexBuilder : public IndexBuilder { include_first_key_(include_first_key), shortening_mode_(shortening_mode) { // Making the default true will disable the feature for old versions - seperator_is_key_plus_seq_ = (format_version <= 2); + must_use_separator_with_seq_.StoreRelaxed(format_version <= 2); } - void OnKeyAdded(const Slice& key) override { + void OnKeyAdded(const Slice& key, + const std::optional& /*value*/) override { if (include_first_key_ && current_block_first_internal_key_.empty()) { current_block_first_internal_key_.assign(key.data(), key.size()); } } - Slice AddIndexEntry(const Slice& last_key_in_current_block, - const Slice* first_key_in_next_block, - const BlockHandle& block_handle, - std::string* separator_scratch) override { - Slice separator; + Slice GetSeparatorWithSeq(const Slice& last_key_in_current_block, + const Slice* first_key_in_next_block, + std::string* separator_scratch) { + Slice separator_with_seq; if (first_key_in_next_block != nullptr) { if (shortening_mode_ != BlockBasedTableOptions::IndexShorteningMode::kNoShortening) { - separator = FindShortestInternalKeySeparator( + separator_with_seq = FindShortestInternalKeySeparator( *comparator_->user_comparator(), last_key_in_current_block, *first_key_in_next_block, separator_scratch); } else { - separator = last_key_in_current_block; + separator_with_seq = last_key_in_current_block; } - if (!seperator_is_key_plus_seq_ && + if (!must_use_separator_with_seq_.LoadRelaxed() && ShouldUseKeyPlusSeqAsSeparator(last_key_in_current_block, *first_key_in_next_block)) { - seperator_is_key_plus_seq_ = true; + must_use_separator_with_seq_.StoreRelaxed(true); } } else { if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode:: kShortenSeparatorsAndSuccessor) { - separator = FindShortInternalKeySuccessor( + separator_with_seq = FindShortInternalKeySuccessor( *comparator_->user_comparator(), last_key_in_current_block, separator_scratch); } else { - separator = last_key_in_current_block; + separator_with_seq = last_key_in_current_block; } } + return separator_with_seq; + } - assert(!include_first_key_ || !current_block_first_internal_key_.empty()); + Slice GetFirstInternalKey(std::string* first_internal_key_buf) const { + if (!include_first_key_) { + return Slice(); + } + assert(!current_block_first_internal_key_.empty()); // When UDT should not be persisted, the index block builders take care of // stripping UDT from the key, for the first internal key contained in the // IndexValue, we need to explicitly do the stripping here before passing // it to the block builders. - std::string first_internal_key_buf; Slice first_internal_key = current_block_first_internal_key_; if (!current_block_first_internal_key_.empty() && ts_sz_ > 0 && !persist_user_defined_timestamps_) { - StripTimestampFromInternalKey(&first_internal_key_buf, + first_internal_key_buf->clear(); + StripTimestampFromInternalKey(first_internal_key_buf, current_block_first_internal_key_, ts_sz_); - first_internal_key = first_internal_key_buf; + first_internal_key = *first_internal_key_buf; } + return first_internal_key; + } + + void AddIndexEntryImpl(const Slice& separator_with_seq, + const Slice& first_internal_key, + const BlockHandle& block_handle, + bool must_use_separator_with_seq, + bool skip_delta_encoding) { IndexValue entry(block_handle, first_internal_key); std::string encoded_entry; std::string delta_encoded_entry; entry.EncodeTo(&encoded_entry, include_first_key_, nullptr); - if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) { + if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull() && + !skip_delta_encoding) { entry.EncodeTo(&delta_encoded_entry, include_first_key_, &last_encoded_handle_); } else { @@ -252,21 +335,98 @@ class ShortenedIndexBuilder : public IndexBuilder { // away the UDT from key in index block as data block does the same thing. // What are the implications if a "FindShortInternalKeySuccessor" // optimization is provided. - index_block_builder_.Add(separator, encoded_entry, - &delta_encoded_entry_slice); - if (!seperator_is_key_plus_seq_) { + index_block_builder_.Add(separator_with_seq, encoded_entry, + &delta_encoded_entry_slice, skip_delta_encoding); + if (!must_use_separator_with_seq) { index_block_builder_without_seq_.Add( - ExtractUserKey(separator), encoded_entry, &delta_encoded_entry_slice); + ExtractUserKey(separator_with_seq), encoded_entry, + &delta_encoded_entry_slice, skip_delta_encoding); + } + + ++num_index_entries_; + UpdateIndexSizeEstimate(); + } + + Slice AddIndexEntry(const Slice& last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle, + std::string* separator_scratch, + bool skip_delta_encoding) override { + Slice separator_with_seq = GetSeparatorWithSeq( + last_key_in_current_block, first_key_in_next_block, separator_scratch); + + std::string first_internal_key_buf; + Slice first_internal_key = GetFirstInternalKey(&first_internal_key_buf); + + AddIndexEntryImpl(separator_with_seq, first_internal_key, block_handle, + must_use_separator_with_seq_.LoadRelaxed(), + skip_delta_encoding); + current_block_first_internal_key_.clear(); + return separator_with_seq; + } + + struct ShortenedPreparedIndexEntry : public PreparedIndexEntry { + std::string separator_with_seq; + std::string first_internal_key; + bool must_use_separator_with_seq = false; + void SaveFrom(const Slice& from_separator, + const Slice& from_first_internal_key, + bool from_must_use_separator_with_seq) { + assert(from_separator.size() >= kNumInternalBytes); + if (from_separator.data() == separator_with_seq.data()) { + // No need to copy + assert(from_separator.size() == separator_with_seq.size()); + } else { + // Copy the separator + separator_with_seq.assign(from_separator.data(), from_separator.size()); + } + // first_internal_key is optional, so it may be empty. + assert(from_first_internal_key.empty() || + from_first_internal_key.size() >= kNumInternalBytes); + if (from_first_internal_key.data() == first_internal_key.data()) { + // No need to copy + assert(from_first_internal_key.size() == first_internal_key.size()); + } else { + // Copy the first internal key + first_internal_key.assign(from_first_internal_key.data(), + from_first_internal_key.size()); + } + must_use_separator_with_seq = from_must_use_separator_with_seq; } + }; + std::unique_ptr CreatePreparedIndexEntry() override { + return std::make_unique(); + } + + void PrepareIndexEntry(const Slice& last_key_in_current_block, + const Slice* first_key_in_next_block, + PreparedIndexEntry* out) override { + ShortenedPreparedIndexEntry* entry = + static_cast(out); + Slice separator = + GetSeparatorWithSeq(last_key_in_current_block, first_key_in_next_block, + &entry->separator_with_seq); + Slice first_internal_key = GetFirstInternalKey(&entry->first_internal_key); + entry->SaveFrom(separator, first_internal_key, + must_use_separator_with_seq_.LoadRelaxed()); current_block_first_internal_key_.clear(); - return separator; + } + + void FinishIndexEntry(const BlockHandle& block_handle, + PreparedIndexEntry* base_entry, + bool skip_delta_encoding) override { + ShortenedPreparedIndexEntry* entry = + static_cast(base_entry); + AddIndexEntryImpl(entry->separator_with_seq, entry->first_internal_key, + block_handle, entry->must_use_separator_with_seq, + skip_delta_encoding); } using IndexBuilder::Finish; Status Finish(IndexBlocks* index_blocks, const BlockHandle& /*last_partition_block_handle*/) override { - if (seperator_is_key_plus_seq_) { + if (must_use_separator_with_seq_.LoadRelaxed()) { index_blocks->index_block_contents = index_block_builder_.Finish(); } else { index_blocks->index_block_contents = @@ -278,8 +438,15 @@ class ShortenedIndexBuilder : public IndexBuilder { size_t IndexSize() const override { return index_size_; } - bool seperator_is_key_plus_seq() override { - return seperator_is_key_plus_seq_; + uint64_t CurrentIndexSizeEstimate() const override { + return estimated_index_size_.LoadRelaxed(); + } + + // Updates the cached size estimate to minimize CPU usage in hot path + void UpdateIndexSizeEstimate() override; + + bool separator_is_key_plus_seq() override { + return must_use_separator_with_seq_.LoadRelaxed(); } // Changes *key to a short string >= *key. @@ -297,13 +464,20 @@ class ShortenedIndexBuilder : public IndexBuilder { private: BlockBuilder index_block_builder_; + // TODO: consider optimizing to only one builder. When discovering that + // sequence numbers are needed, read existing entries without seq and rewrite + // them with seq (which should be trivial to populate since seq wasn't needed + // before). BlockBuilder index_block_builder_without_seq_; const bool use_value_delta_encoding_; - bool seperator_is_key_plus_seq_; + RelaxedAtomic must_use_separator_with_seq_; const bool include_first_key_; BlockBasedTableOptions::IndexShorteningMode shortening_mode_; BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle(); std::string current_block_first_internal_key_; + uint64_t num_index_entries_ = 0; + // Cache for index size estimate to avoid recalculating in hot path + RelaxedAtomic estimated_index_size_{0}; }; // HashIndexBuilder contains a binary-searchable primary index and the @@ -351,14 +525,35 @@ class HashIndexBuilder : public IndexBuilder { Slice AddIndexEntry(const Slice& last_key_in_current_block, const Slice* first_key_in_next_block, const BlockHandle& block_handle, - std::string* separator_scratch) override { + std::string* separator_scratch, + bool skip_delta_encoding) override { ++current_restart_index_; return primary_index_builder_.AddIndexEntry( last_key_in_current_block, first_key_in_next_block, block_handle, - separator_scratch); + separator_scratch, skip_delta_encoding); + } + + std::unique_ptr CreatePreparedIndexEntry() override { + return primary_index_builder_.CreatePreparedIndexEntry(); + } + + void PrepareIndexEntry(const Slice& last_key_in_current_block, + const Slice* first_key_in_next_block, + PreparedIndexEntry* out) override { + ++current_restart_index_; + primary_index_builder_.PrepareIndexEntry(last_key_in_current_block, + first_key_in_next_block, out); + } + + void FinishIndexEntry(const BlockHandle& block_handle, + PreparedIndexEntry* entry, + bool skip_delta_encoding) override { + primary_index_builder_.FinishIndexEntry(block_handle, entry, + skip_delta_encoding); } - void OnKeyAdded(const Slice& key) override { + void OnKeyAdded(const Slice& key, + const std::optional& /*value*/) override { auto key_prefix = hash_key_extractor_->Transform(key); bool is_first_entry = pending_block_num_ == 0; @@ -393,9 +588,9 @@ class HashIndexBuilder : public IndexBuilder { Status s = primary_index_builder_.Finish(index_blocks, last_partition_block_handle); index_blocks->meta_blocks.insert( - {kHashIndexPrefixesBlock.c_str(), prefix_block_}); - index_blocks->meta_blocks.insert( - {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_}); + {kHashIndexPrefixesBlock.c_str(), {BlockType::kIndex, prefix_block_}}); + index_blocks->meta_blocks.insert({kHashIndexPrefixesMetadataBlock.c_str(), + {BlockType::kIndex, prefix_meta_block_}}); return s; } @@ -404,8 +599,10 @@ class HashIndexBuilder : public IndexBuilder { prefix_meta_block_.size(); } - bool seperator_is_key_plus_seq() override { - return primary_index_builder_.seperator_is_key_plus_seq(); + uint64_t CurrentIndexSizeEstimate() const override { return 0; } + + bool separator_is_key_plus_seq() override { + return primary_index_builder_.separator_is_key_plus_seq(); } private: @@ -461,7 +658,17 @@ class PartitionedIndexBuilder : public IndexBuilder { Slice AddIndexEntry(const Slice& last_key_in_current_block, const Slice* first_key_in_next_block, const BlockHandle& block_handle, - std::string* separator_scratch) override; + std::string* separator_scratch, + bool skip_delta_encoding) override; + + std::unique_ptr CreatePreparedIndexEntry() override; + void PrepareIndexEntry(const Slice& last_key_in_current_block, + const Slice* first_key_in_next_block, + PreparedIndexEntry* out) override; + void FinishIndexEntry(const BlockHandle& block_handle, + PreparedIndexEntry* entry, + bool skip_delta_encoding) override; + void MaybeFlush(const Slice& index_key, const BlockHandle& index_value); Status Finish(IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) override; @@ -470,6 +677,12 @@ class PartitionedIndexBuilder : public IndexBuilder { size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; } size_t NumPartitions() const; + // Returns a cached estimate of the current index size. This + // estimate is updated when data blocks are added. + uint64_t CurrentIndexSizeEstimate() const override { + return estimated_index_size_.LoadRelaxed(); + } + inline bool ShouldCutFilterBlock() { // Current policy is to align the partitions of index and filters if (cut_filter_block) { @@ -488,8 +701,10 @@ class PartitionedIndexBuilder : public IndexBuilder { // cutting the next partition void RequestPartitionCut(); - bool seperator_is_key_plus_seq() override { - return seperator_is_key_plus_seq_; + // This function must be thread safe because multiple worker threads might + // update the index builder state during parallel compression. + bool separator_is_key_plus_seq() override { + return must_use_separator_with_seq_.LoadRelaxed(); } bool get_use_value_delta_encoding() const { @@ -503,6 +718,7 @@ class PartitionedIndexBuilder : public IndexBuilder { size_t partition_cnt_ = 0; void MakeNewSubIndexBuilder(); + void UpdateIndexSizeEstimate() override; struct Entry { std::string key; @@ -515,14 +731,14 @@ class PartitionedIndexBuilder : public IndexBuilder { std::list entries_; BlockBuilder index_block_builder_; // top-level index builder BlockBuilder index_block_builder_without_seq_; // same for user keys - // the active partition index builder - std::unique_ptr sub_index_builder_; + // the active partition index builder (owned by an Entry in entries_) + ShortenedIndexBuilder* sub_index_builder_; // the last key in the active partition index builder - std::unique_ptr flush_policy_; + std::unique_ptr flush_policy_; // true if Finish is called once but not complete yet. - bool finishing_indexes = false; + bool finishing_indexes_ = false; const BlockBasedTableOptions& table_opt_; - bool seperator_is_key_plus_seq_; + RelaxedAtomic must_use_separator_with_seq_; bool use_value_delta_encoding_; // true if an external entity (such as filter partition builder) request // cutting the next partition @@ -530,5 +746,9 @@ class PartitionedIndexBuilder : public IndexBuilder { // true if it should cut the next filter partition block bool cut_filter_block = false; BlockHandle last_encoded_handle_; + // Cached estimate of current index size, updated when data blocks are added + RelaxedAtomic estimated_index_size_{0}; + // Running estimate of completed partitions total size + RelaxedAtomic estimated_completed_partitions_size_{0}; }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/index_reader_common.cc b/table/block_based/index_reader_common.cc index 2c0b480e2f3f..6b0a6ab71dce 100644 --- a/table/block_based/index_reader_common.cc +++ b/table/block_based/index_reader_common.cc @@ -26,9 +26,9 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( assert(rep != nullptr); const Status s = table->RetrieveBlock( - prefetch_buffer, read_options, rep->index_handle, - UncompressionDict::GetEmptyDict(), &index_block->As(), - get_context, lookup_context, /* for_compaction */ false, use_cache, + prefetch_buffer, read_options, rep->index_handle, rep->decompressor.get(), + &index_block->As(), get_context, lookup_context, + /* for_compaction */ false, use_cache, /* async_read */ false, /* use_block_cache_for_lookup */ true); return s; diff --git a/table/block_based/mock_block_based_table.h b/table/block_based/mock_block_based_table.h index 13f3dfaee14b..481589076f4a 100644 --- a/table/block_based/mock_block_based_table.h +++ b/table/block_based/mock_block_based_table.h @@ -32,7 +32,7 @@ class MockBlockBasedTableTester { explicit MockBlockBasedTableTester(const FilterPolicy* filter_policy) : MockBlockBasedTableTester( - std::shared_ptr(filter_policy)){}; + std::shared_ptr(filter_policy)) {}; explicit MockBlockBasedTableTester( std::shared_ptr filter_policy) diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index ce0b691a47f3..95c1cf32a2e8 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -143,6 +143,7 @@ void PartitionedFilterBlockBuilder::CutAFilterBlock(const Slice* next_key, ikey = p_index_builder_->GetPartitionKey(); } filters_.push_back({std::move(ikey), std::move(filter_data), filter}); + completed_partitions_size_.FetchAddRelaxed(filter.size()); partitioned_filters_construction_status_.UpdateIfOk( filter_construction_status); @@ -209,6 +210,56 @@ size_t PartitionedFilterBlockBuilder::EstimateEntriesAdded() { return total_added_in_built_ + filter_bits_builder_->EstimateEntriesAdded(); } +size_t PartitionedFilterBlockBuilder::CurrentFilterSizeEstimate() { + size_t active_partition_size = + filter_bits_builder_->EstimateEntriesAdded() * 2; // 2 bytes per key + + return estimated_filter_size_.LoadRelaxed() + active_partition_size; +} + +void PartitionedFilterBlockBuilder::OnDataBlockFinalized( + uint64_t num_data_blocks) { + UpdateFilterSizeEstimate(num_data_blocks); +} + +void PartitionedFilterBlockBuilder::UpdateFilterSizeEstimate( + uint64_t num_data_blocks) { + size_t partitions_size = completed_partitions_size_.LoadRelaxed(); + + // Reserve space if no partitions have been cut + size_t active_filter_estimate = 0; + if (partitions_size == 0) { + size_t avg_bytes_per_entry = + 2; // 2 bytes per entry, approx 15 bits per key + + // Estimate using keys_per_partition_ since we expect to cut the first + // partition once it reaches approx. this many entries. + active_filter_estimate = keys_per_partition_ * avg_bytes_per_entry; + + // Add a 2x buffer (for top-level index, etc.) + active_filter_estimate = active_filter_estimate * 2; + } + size_t filter_estimate = std::max(partitions_size, active_filter_estimate); + + // Estimate top-level partition index size + if (p_index_builder_->separator_is_key_plus_seq()) { + filter_estimate += index_on_filter_block_builder_.CurrentSizeEstimate(); + } else { + filter_estimate += + index_on_filter_block_builder_without_seq_.CurrentSizeEstimate(); + } + + // Reserve filter space for the next data block + size_t reserved = 0; + if (num_data_blocks > 0) { + reserved = (filter_estimate / num_data_blocks) * + 2; // 2x average size per data block + estimated_filter_size_.StoreRelaxed(filter_estimate + reserved); + } else { + estimated_filter_size_.StoreRelaxed(filter_estimate); + } +} + void PartitionedFilterBlockBuilder::PrevKeyBeforeFinish( const Slice& prev_key_without_ts) { assert(prev_key_without_ts.compare(DEBUG_add_with_prev_key_called_ @@ -240,7 +291,7 @@ Status PartitionedFilterBlockBuilder::Finish( index_on_filter_block_builder_.Add(e.ikey, handle_encoding, &handle_delta_encoding_slice); - if (!p_index_builder_->seperator_is_key_plus_seq()) { + if (!p_index_builder_->separator_is_key_plus_seq()) { index_on_filter_block_builder_without_seq_.Add( ExtractUserKey(e.ikey), handle_encoding, &handle_delta_encoding_slice); @@ -267,7 +318,7 @@ Status PartitionedFilterBlockBuilder::Finish( if (UNLIKELY(filters_.empty())) { if (!index_on_filter_block_builder_.empty()) { // Simplest to just add them all at the end - if (p_index_builder_->seperator_is_key_plus_seq()) { + if (p_index_builder_->separator_is_key_plus_seq()) { *filter = index_on_filter_block_builder_.Finish(); } else { *filter = index_on_filter_block_builder_without_seq_.Finish(); @@ -413,8 +464,7 @@ Status PartitionedFilterBlockReader::GetFilterPartitionBlock( const Status s = table()->RetrieveBlock( prefetch_buffer, read_options, fltr_blk_handle, - UncompressionDict::GetEmptyDict(), filter_block, get_context, - lookup_context, + /* decomp */ nullptr, filter_block, get_context, lookup_context, /* for_compaction */ false, /* use_cache */ true, /* async_read */ false, /* use_block_cache_for_lookup */ true); @@ -592,7 +642,8 @@ Status PartitionedFilterBlockReader::CacheDependencies( /*usage=*/FilePrefetchBufferUsage::kUnknown); IOOptions opts; - s = rep->file->PrepareIOOptions(ro, opts); + IODebugContext dbg; + s = rep->file->PrepareIOOptions(ro, opts, &dbg); if (s.ok()) { s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, static_cast(prefetch_len)); @@ -610,7 +661,7 @@ Status PartitionedFilterBlockReader::CacheDependencies( // filter blocks s = table()->MaybeReadBlockAndLoadToCache( prefetch_buffer ? prefetch_buffer.get() : tail_prefetch_buffer, ro, - handle, UncompressionDict::GetEmptyDict(), + handle, /* dict */ nullptr, /* for_compaction */ false, &block, nullptr /* get_context */, &lookup_context, nullptr /* contents */, false, /* use_block_cache_for_lookup */ true); diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 8faed24a92db..96f39dd4f01a 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -18,6 +18,7 @@ #include "table/block_based/filter_block_reader_common.h" #include "table/block_based/full_filter_block.h" #include "table/block_based/index_builder.h" +#include "util/atomic.h" #include "util/autovector.h" #include "util/hash_containers.h" @@ -46,6 +47,8 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { } size_t EstimateEntriesAdded() override; + size_t CurrentFilterSizeEstimate() override; + void OnDataBlockFinalized(uint64_t num_data_blocks) override; void PrevKeyBeforeFinish(const Slice& prev_key_without_ts) override; Status Finish(const BlockHandle& last_partition_block_handle, Slice* filter, @@ -67,6 +70,11 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { return Status::OK(); } + protected: + // Needs to be thread-safe to be invoked from background worker + // thread when parallel compression is enabled. + void UpdateFilterSizeEstimate(uint64_t num_data_blocks) override; + private: // fns // Whether to cut a filter block before the next key bool DecideCutAFilterBlock(); @@ -92,6 +100,11 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { }; std::deque filters_; // list of partitioned filters and keys // used in building the index + // Running total of completed filter partition sizes to avoid + // iterating over filters_ deque, which can be concurrently modified by + // the main thread when parallel compression is enabled. + RelaxedAtomic completed_partitions_size_{0}; + // The desired number of keys per partition uint32_t keys_per_partition_; // According to the bits builders, how many keys/prefixes added @@ -107,6 +120,12 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { // For Add without prev key std::string prev_key_without_ts_; + // Cached filter size estimate for hot path performance - updated only when + // data blocks are written for meaningful estimate updates. + // Must be atomic since UpdateFilterSizeEstimate() can be called from + // background worker threads when parallel compression is enabled. + RelaxedAtomic estimated_filter_size_{0}; + #ifndef NDEBUG // For verifying accurate previous keys are provided by the caller, so that // release code can be fast diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 80cb131a990b..02869a879c61 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -27,7 +27,7 @@ class MockedBlockBasedTable : public BlockBasedTable { MockedBlockBasedTable(Rep* rep, PartitionedIndexBuilder* pib) : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) { // Initialize what Open normally does as much as necessary for the test - rep->index_key_includes_seq = pib->seperator_is_key_plus_seq(); + rep->index_key_includes_seq = pib->separator_is_key_plus_seq(); rep->index_value_is_full = !pib->get_use_value_delta_encoding(); } }; @@ -315,7 +315,8 @@ class PartitionedFilterBlockTest std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep()); BlockHandle dont_care_block_handle(1, 1); std::string scratch; - builder->AddIndexEntry(key, nullptr, dont_care_block_handle, &scratch); + builder->AddIndexEntry(key, nullptr, dont_care_block_handle, &scratch, + false); } void CutABlock(PartitionedIndexBuilder* builder, const std::string& user_key, @@ -327,7 +328,8 @@ class PartitionedFilterBlockTest BlockHandle dont_care_block_handle(1, 1); Slice slice = Slice(next_key.data(), next_key.size()); std::string scratch; - builder->AddIndexEntry(key, &slice, dont_care_block_handle, &scratch); + builder->AddIndexEntry(key, &slice, dont_care_block_handle, &scratch, + false); } int CountNumOfIndexPartitions(PartitionedIndexBuilder* builder) { @@ -348,7 +350,7 @@ INSTANTIATE_TEST_CASE_P( FormatVersions, PartitionedFilterBlockTest, testing::Combine( testing::ValuesIn(std::set{ - 2, 3, 4, 5, test::kDefaultFormatVersion, kLatestFormatVersion}), + 2, 3, 4, 5, test::kDefaultFormatVersion, kLatestBbtFormatVersion}), testing::ValuesIn(test::GetUDTTestModes()), testing::Bool())); TEST_P(PartitionedFilterBlockTest, EmptyBuilder) { diff --git a/table/block_based/partitioned_index_iterator.h b/table/block_based/partitioned_index_iterator.h index 6412fe2399b5..31ccded9a025 100644 --- a/table/block_based/partitioned_index_iterator.h +++ b/table/block_based/partitioned_index_iterator.h @@ -81,8 +81,6 @@ class PartitionedIndexIterator : public InternalIteratorBase { } } inline IterBoundCheck UpperBoundCheckResult() override { - // Shouldn't be called. - assert(false); return IterBoundCheck::kUnknown; } void SetPinnedItersMgr(PinnedIteratorsManager*) override { diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc index 04c73ba0bbec..da3f3658da59 100644 --- a/table/block_based/partitioned_index_reader.cc +++ b/table/block_based/partitioned_index_reader.cc @@ -190,7 +190,7 @@ Status PartitionIndexReader::CacheDependencies( // filter blocks Status s = table()->MaybeReadBlockAndLoadToCache( prefetch_buffer ? prefetch_buffer.get() : tail_prefetch_buffer, ro, - handle, UncompressionDict::GetEmptyDict(), + handle, rep->decompressor.get(), /*for_compaction=*/false, &block.As(), /*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr, /*async_read=*/false, /*use_block_cache_for_lookup=*/true); diff --git a/table/block_based/reader_common.cc b/table/block_based/reader_common.cc index 8f8c82ff43ac..fbafe414dd9a 100644 --- a/table/block_based/reader_common.cc +++ b/table/block_based/reader_common.cc @@ -25,7 +25,7 @@ void ForceReleaseCachedEntry(void* arg, void* h) { // WART: this is specific to block-based table Status VerifyBlockChecksum(const Footer& footer, const char* data, size_t block_size, const std::string& file_name, - uint64_t offset) { + uint64_t offset, BlockType block_type) { PERF_TIMER_GUARD(block_checksum_time); assert(footer.GetBlockTrailerSize() == 5); @@ -58,7 +58,8 @@ Status VerifyBlockChecksum(const Footer& footer, const char* data, std::string(modifier ? "(context removed)" : "") + " = " + std::to_string(stored) + ", computed = " + std::to_string(computed) + ", type = " + std::to_string(type) + " in " + file_name + " offset " + - std::to_string(offset) + " size " + std::to_string(block_size)); + std::to_string(offset) + " size " + std::to_string(block_size) + + ", block_type = " + BlockTypeToString(block_type)); } } } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/reader_common.h b/table/block_based/reader_common.h index 89518fd8c2a4..6d16f4069413 100644 --- a/table/block_based/reader_common.h +++ b/table/block_based/reader_common.h @@ -10,6 +10,7 @@ #include "rocksdb/advanced_cache.h" #include "rocksdb/table.h" +#include "table/block_based/block_type.h" namespace ROCKSDB_NAMESPACE { class Footer; @@ -27,10 +28,12 @@ inline MemoryAllocator* GetMemoryAllocator( // Assumes block has a trailer past `data + block_size` as in format.h. // `file_name` provided for generating diagnostic message in returned status. // `offset` might be required for proper verification (also used for message). +// `block_type` is included in the error message to provide context about +// which type of block failed checksum verification. // // Returns Status::OK() on checksum match, or Status::Corruption() on checksum // mismatch. Status VerifyBlockChecksum(const Footer& footer, const char* data, size_t block_size, const std::string& file_name, - uint64_t offset); + uint64_t offset, BlockType block_type); } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/uncompression_dict_reader.cc b/table/block_based/uncompression_dict_reader.cc index b7c9e02f01ba..2a6b25aaa5ee 100644 --- a/table/block_based/uncompression_dict_reader.cc +++ b/table/block_based/uncompression_dict_reader.cc @@ -23,7 +23,7 @@ Status UncompressionDictReader::Create( assert(!pin || prefetch); assert(uncompression_dict_reader); - CachableEntry uncompression_dict; + CachableEntry uncompression_dict; if (prefetch || !use_cache) { const Status s = ReadUncompressionDictionary( table, prefetch_buffer, ro, use_cache, nullptr /* get_context */, @@ -47,7 +47,7 @@ Status UncompressionDictReader::ReadUncompressionDictionary( const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, const ReadOptions& read_options, bool use_cache, GetContext* get_context, BlockCacheLookupContext* lookup_context, - CachableEntry* uncompression_dict) { + CachableEntry* uncompression_dict) { // TODO: add perf counter for compression dictionary read time assert(table); @@ -60,8 +60,7 @@ Status UncompressionDictReader::ReadUncompressionDictionary( const Status s = table->RetrieveBlock( prefetch_buffer, read_options, rep->compression_dict_handle, - UncompressionDict::GetEmptyDict(), uncompression_dict, get_context, - lookup_context, + /* decomp */ nullptr, uncompression_dict, get_context, lookup_context, /* for_compaction */ false, use_cache, /* async_read */ false, /* use_block_cache_for_lookup */ true); @@ -79,7 +78,7 @@ Status UncompressionDictReader::ReadUncompressionDictionary( Status UncompressionDictReader::GetOrReadUncompressionDictionary( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, GetContext* get_context, BlockCacheLookupContext* lookup_context, - CachableEntry* uncompression_dict) const { + CachableEntry* uncompression_dict) const { assert(uncompression_dict); if (!uncompression_dict_.IsEmpty()) { diff --git a/table/block_based/uncompression_dict_reader.h b/table/block_based/uncompression_dict_reader.h index b5d64dbf1458..d0579a66055c 100644 --- a/table/block_based/uncompression_dict_reader.h +++ b/table/block_based/uncompression_dict_reader.h @@ -18,7 +18,6 @@ struct BlockCacheLookupContext; class FilePrefetchBuffer; class GetContext; struct ReadOptions; -struct UncompressionDict; // Provides access to the uncompression dictionary regardless of whether // it is owned by the reader or stored in the cache, or whether it is pinned @@ -34,13 +33,13 @@ class UncompressionDictReader { Status GetOrReadUncompressionDictionary( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, GetContext* get_context, BlockCacheLookupContext* lookup_context, - CachableEntry* uncompression_dict) const; + CachableEntry* uncompression_dict) const; size_t ApproximateMemoryUsage() const; private: UncompressionDictReader(const BlockBasedTable* t, - CachableEntry&& uncompression_dict) + CachableEntry&& uncompression_dict) : table_(t), uncompression_dict_(std::move(uncompression_dict)) { assert(table_); } @@ -51,10 +50,10 @@ class UncompressionDictReader { const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, const ReadOptions& read_options, bool use_cache, GetContext* get_context, BlockCacheLookupContext* lookup_context, - CachableEntry* uncompression_dict); + CachableEntry* uncompression_dict); const BlockBasedTable* table_; - CachableEntry uncompression_dict_; + CachableEntry uncompression_dict_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/user_defined_index_wrapper.h b/table/block_based/user_defined_index_wrapper.h new file mode 100644 index 000000000000..b65ba147e2fc --- /dev/null +++ b/table/block_based/user_defined_index_wrapper.h @@ -0,0 +1,326 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/user_defined_index.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_type.h" +#include "table/block_based/cachable_entry.h" +#include "table/block_based/index_builder.h" + +namespace ROCKSDB_NAMESPACE { + +// UserDefinedIndexWrapper wraps around the existing index types in block based +// table, and supports plugging in an additional user defined index. The wrapper +// class forwards calls to both the wrapped internal index, and a user defined +// index builder. +class UserDefinedIndexBuilderWrapper : public IndexBuilder { + public: + UserDefinedIndexBuilderWrapper( + const std::string& name, + std::unique_ptr internal_index_builder, + std::unique_ptr user_defined_index_builder, + const InternalKeyComparator* comparator, size_t ts_sz, + bool persist_user_defined_timestamps) + : IndexBuilder(comparator, ts_sz, persist_user_defined_timestamps), + name_(name), + internal_index_builder_(std::move(internal_index_builder)), + user_defined_index_builder_(std::move(user_defined_index_builder)) {} + + ~UserDefinedIndexBuilderWrapper() override = default; + + Slice AddIndexEntry(const Slice& last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle, + std::string* separator_scratch, + bool skip_delta_encoding) override { + UserDefinedIndexBuilder::BlockHandle handle; + handle.offset = block_handle.offset(); + handle.size = block_handle.size(); + // Forward the call to both index builders + ParsedInternalKey pkey_last; + ParsedInternalKey pkey_first; + // There's no way to return an error here, so we remember the statsu and + // return it in Finish() + if (status_.ok()) { + status_ = ParseInternalKey(last_key_in_current_block, &pkey_last, + /*lof_err_key*/ false); + } + if (status_.ok() && first_key_in_next_block) { + status_ = ParseInternalKey(*first_key_in_next_block, &pkey_first, + /*lof_err_key*/ false); + } + if (status_.ok()) { + user_defined_index_builder_->AddIndexEntry( + pkey_last.user_key, + first_key_in_next_block ? &pkey_first.user_key : nullptr, handle, + separator_scratch); + } + return internal_index_builder_->AddIndexEntry( + last_key_in_current_block, first_key_in_next_block, block_handle, + separator_scratch, skip_delta_encoding); + } + + // Not supported with parallel compression + std::unique_ptr CreatePreparedIndexEntry() override { + return nullptr; + } + void PrepareIndexEntry(const Slice& last_key_in_current_block, + const Slice* first_key_in_next_block, + PreparedIndexEntry* out) override { + (void)last_key_in_current_block; + (void)first_key_in_next_block; + (void)out; + assert(false); + } + void FinishIndexEntry(const BlockHandle& block_handle, + PreparedIndexEntry* entry, + bool skip_delta_encoding) override { + (void)block_handle; + (void)entry; + (void)skip_delta_encoding; + assert(false); + } + + void OnKeyAdded(const Slice& key, + const std::optional& value) override { + ParsedInternalKey pkey; + if (status_.ok()) { + if (!value.has_value()) { + status_ = Status::InvalidArgument( + "user_defined_index_factory not supported with parallel " + "compression"); + } else { + status_ = ParseInternalKey(key, &pkey, /*lof_err_key*/ false); + if (status_.ok() && pkey.type != ValueType::kTypeValue) { + status_ = Status::InvalidArgument( + "user_defined_index_factory only supported with Puts"); + } + } + } + if (!status_.ok()) { + return; + } + + // Forward the call to both index builders + internal_index_builder_->OnKeyAdded(key, value); + + // Pass the user key to the UDI. We don't expect multiple entries with + // different sequence numbers for the same key in the file. RocksDB may + // enforce it in the future by allowing UDIs only for read only + // bulkloaded use cases, and only allow ingestion of files with + // sequence number 0. + user_defined_index_builder_->OnKeyAdded( + pkey.user_key, UserDefinedIndexBuilder::ValueType::kValue, + value.value()); + } + + Status Finish(IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) override { + if (!status_.ok() && !status_.IsIncomplete()) { + return status_; + } + + if (!udi_finished_) { + // Finish the user defined index builder + Slice user_index_contents; + status_ = user_defined_index_builder_->Finish(&user_index_contents); + if (!status_.ok()) { + return status_; + } + + // Add the user defined index to the meta blocks + std::string block_name = kUserDefinedIndexPrefix + name_; + index_blocks->meta_blocks.insert( + {block_name, {BlockType::kUserDefinedIndex, user_index_contents}}); + udi_finished_ = true; + } + + // Finish the internal index builder + status_ = internal_index_builder_->Finish(index_blocks, + last_partition_block_handle); + if (!status_.ok()) { + return status_; + } + + index_size_ = internal_index_builder_->IndexSize(); + return status_; + } + + size_t IndexSize() const override { return index_size_; } + + uint64_t CurrentIndexSizeEstimate() const override { return 0; } + + bool separator_is_key_plus_seq() override { + return internal_index_builder_->separator_is_key_plus_seq(); + } + + private: + const std::string name_; + std::unique_ptr internal_index_builder_; + std::unique_ptr user_defined_index_builder_; + Status status_; + bool udi_finished_ = false; +}; + +class UserDefinedIndexIteratorWrapper + : public InternalIteratorBase { + public: + explicit UserDefinedIndexIteratorWrapper( + std::unique_ptr&& udi_iter) + : udi_iter_(std::move(udi_iter)), valid_(false) {} + + bool Valid() const override { return valid_; } + + void SeekToFirst() override { + status_ = Status::NotSupported("SeekToFirst not supported"); + } + + void SeekToLast() override { + status_ = Status::NotSupported("SeekToLast not supported"); + } + + void Seek(const Slice& target) override { + ParsedInternalKey pkey; + status_ = ParseInternalKey(target, &pkey, /*log_err_key=*/false); + if (status_.ok()) { + status_ = udi_iter_->SeekAndGetResult(pkey.user_key, &result_); + } + if (status_.ok()) { + valid_ = result_.bound_check_result == IterBoundCheck::kInbound; + if (valid_) { + ikey_.Set(result_.key, 0, ValueType::kTypeValue); + } + } else { + valid_ = false; + } + } + + void Next() override { + status_ = udi_iter_->NextAndGetResult(&result_); + if (status_.ok()) { + valid_ = result_.bound_check_result == IterBoundCheck::kInbound; + if (valid_) { + ikey_.Set(result_.key, 0, ValueType::kTypeValue); + } + } else { + valid_ = false; + } + } + + bool NextAndGetResult(IterateResult* result) override { + status_ = udi_iter_->NextAndGetResult(&result_); + if (status_.ok()) { + valid_ = result_.bound_check_result == IterBoundCheck::kInbound; + if (valid_) { + ikey_.Set(result_.key, 0, ValueType::kTypeValue); + } + if (status_.ok()) { + *result = result_; + } + } else { + valid_ = false; + } + return valid_; + } + + void SeekForPrev(const Slice& /*target*/) override { + status_ = Status::NotSupported("SeekForPrev not supported"); + } + + void Prev() override { status_ = Status::NotSupported("Prev not supported"); } + + Slice key() const override { return Slice(*ikey_.const_rep()); } + + IndexValue value() const override { + auto handle = udi_iter_->value(); + IndexValue val(BlockHandle(handle.offset, handle.size), Slice()); + return val; + } + + Status status() const override { return status_; } + + void Prepare(const MultiScanArgs* scan_opts) override { + if (scan_opts) { + udi_iter_->Prepare(scan_opts->GetScanRanges().data(), + scan_opts->GetScanRanges().size()); + } + } + + IterBoundCheck UpperBoundCheckResult() override { + return result_.bound_check_result; + } + + private: + std::unique_ptr udi_iter_; + IterateResult result_; + InternalKey ikey_; + Status status_; + bool valid_; +}; + +class UserDefinedIndexReaderWrapper : public BlockBasedTable::IndexReader { + public: + UserDefinedIndexReaderWrapper( + const std::string& name, + std::unique_ptr&& reader, + std::unique_ptr&& udi_reader) + : name_(name), + reader_(std::move(reader)), + udi_reader_(std::move(udi_reader)) {} + + virtual InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { + if (!read_options.table_index_factory) { + return reader_->NewIterator(read_options, disable_prefix_seek, iter, + get_context, lookup_context); + } + if (name_ != read_options.table_index_factory->Name()) { + return NewErrorInternalIterator(Status::InvalidArgument( + "Bad index name" + + std::string(read_options.table_index_factory->Name()) + + ". Only supported UDI is " + name_)); + } + std::unique_ptr udi_iter = + udi_reader_->NewIterator(read_options); + if (udi_iter) { + InternalIteratorBase* wrap_iter = + new UserDefinedIndexIteratorWrapper(std::move(udi_iter)); + return wrap_iter; + } + return NewErrorInternalIterator( + Status::NotFound("COuld not create UDI iterator")); + } + + virtual Status CacheDependencies( + const ReadOptions& ro, bool pin, + FilePrefetchBuffer* tail_prefetch_buffer) override { + return reader_->CacheDependencies(ro, pin, tail_prefetch_buffer); + } + + size_t ApproximateMemoryUsage() const override { + return reader_->ApproximateMemoryUsage(); + } + + virtual void EraseFromCacheBeforeDestruction( + uint32_t uncache_aggressiveness) override { + reader_->EraseFromCacheBeforeDestruction(uncache_aggressiveness); + } + + private: + std::string name_; + std::unique_ptr reader_; + std::unique_ptr udi_reader_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 0637440bdcf9..2f4ee64b19fc 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -33,20 +33,20 @@ inline void BlockFetcher::ProcessTrailerIfPresent() { if (footer_.GetBlockTrailerSize() > 0) { assert(footer_.GetBlockTrailerSize() == BlockBasedTable::kBlockTrailerSize); if (read_options_.verify_checksums) { - io_status_ = status_to_io_status( - VerifyBlockChecksum(footer_, slice_.data(), block_size_, - file_->file_name(), handle_.offset())); + io_status_ = status_to_io_status(VerifyBlockChecksum( + footer_, slice_.data(), block_size_, file_->file_name(), + handle_.offset(), block_type_)); RecordTick(ioptions_.stats, BLOCK_CHECKSUM_COMPUTE_COUNT); if (!io_status_.ok()) { assert(io_status_.IsCorruption()); RecordTick(ioptions_.stats, BLOCK_CHECKSUM_MISMATCH_COUNT); } } - compression_type_ = + compression_type() = BlockBasedTable::GetBlockCompressionType(slice_.data(), block_size_); } else { // E.g. plain table or cuckoo table - compression_type_ = kNoCompression; + compression_type() = kNoCompression; } } @@ -74,7 +74,8 @@ inline bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() { inline bool BlockFetcher::TryGetFromPrefetchBuffer() { if (prefetch_buffer_ != nullptr) { IOOptions opts; - IOStatus io_s = file_->PrepareIOOptions(read_options_, opts); + IODebugContext dbg; + IOStatus io_s = file_->PrepareIOOptions(read_options_, opts, &dbg); if (io_s.ok()) { bool read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCache( opts, file_, handle_.offset(), block_size_with_trailer_, &slice_, @@ -195,7 +196,7 @@ inline void BlockFetcher::CopyBufferToCompressedBuf() { } // Before - Entering this method means the block is uncompressed or do not need -// to be uncompressed. +// to be decompressed. // // The block can be in one of the following buffers: // 1. prefetch buffer if prefetch is enabled and the block is prefetched before @@ -219,14 +220,14 @@ inline void BlockFetcher::GetBlockContents() { if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) { CopyBufferToHeapBuf(); } else if (used_buf_ == compressed_buf_.get()) { - if (compression_type_ == kNoCompression && + if (compression_type() == kNoCompression && memory_allocator_ != memory_allocator_compressed_) { CopyBufferToHeapBuf(); } else { heap_buf_ = std::move(compressed_buf_); } } else if (direct_io_buf_.get() != nullptr || use_fs_scratch_) { - if (compression_type_ == kNoCompression) { + if (compression_type() == kNoCompression) { CopyBufferToHeapBuf(); } else { CopyBufferToCompressedBuf(); @@ -241,12 +242,13 @@ inline void BlockFetcher::GetBlockContents() { } // Read a block from the file and verify its checksum. Upon return, io_status_ -// will be updated with the status of the read, and slice_ will be updated -// with a pointer to the data. +// will be updated with the status of the read, and slice_ will be +// updated with a pointer to the data. void BlockFetcher::ReadBlock(bool retry) { FSReadRequest read_req; IOOptions opts; - io_status_ = file_->PrepareIOOptions(read_options_, opts); + IODebugContext dbg; + io_status_ = file_->PrepareIOOptions(read_options_, opts, &dbg); opts.verify_and_reconstruct_read = retry; read_req.status.PermitUncheckedError(); // Actual file read @@ -256,8 +258,9 @@ void BlockFetcher::ReadBlock(bool retry) { PERF_CPU_TIMER_GUARD( block_read_cpu_time, ioptions_.env ? ioptions_.env->GetSystemClock().get() : nullptr); - io_status_ = file_->Read(opts, handle_.offset(), block_size_with_trailer_, - &slice_, /*scratch=*/nullptr, &direct_io_buf_); + io_status_ = + file_->Read(opts, handle_.offset(), block_size_with_trailer_, &slice_, + /*scratch=*/nullptr, &direct_io_buf_, &dbg); PERF_COUNTER_ADD(block_read_count, 1); used_buf_ = const_cast(slice_.data()); } else if (use_fs_scratch_) { @@ -269,7 +272,7 @@ void BlockFetcher::ReadBlock(bool retry) { read_req.len = block_size_with_trailer_; read_req.scratch = nullptr; io_status_ = file_->MultiRead(opts, &read_req, /*num_reqs=*/1, - /*AlignedBuf* =*/nullptr); + /*AlignedBuf* =*/nullptr, &dbg); PERF_COUNTER_ADD(block_read_count, 1); slice_ = Slice(read_req.result.data(), read_req.result.size()); @@ -283,9 +286,10 @@ void BlockFetcher::ReadBlock(bool retry) { block_read_cpu_time, ioptions_.env ? ioptions_.env->GetSystemClock().get() : nullptr); - io_status_ = file_->Read( - opts, handle_.offset(), /*size*/ block_size_with_trailer_, - /*result*/ &slice_, /*scratch*/ used_buf_, /*aligned_buf=*/nullptr); + io_status_ = + file_->Read(opts, handle_.offset(), /*size*/ block_size_with_trailer_, + /*result*/ &slice_, /*scratch*/ used_buf_, + /*aligned_buf=*/nullptr, &dbg); PERF_COUNTER_ADD(block_read_count, 1); #ifndef NDEBUG if (slice_.data() == &stack_buf_[0]) { @@ -320,6 +324,7 @@ void BlockFetcher::ReadBlock(bool retry) { } PERF_COUNTER_ADD(block_read_byte, block_size_with_trailer_); + IGNORE_STATUS_IF_ERROR(io_status_); if (io_status_.ok()) { if (use_fs_scratch_ && !read_req.status.ok()) { io_status_ = read_req.status; @@ -356,7 +361,7 @@ void BlockFetcher::ReadBlock(bool retry) { IOStatus BlockFetcher::ReadBlockContents() { if (TryGetUncompressBlockFromPersistentCache()) { - compression_type_ = kNoCompression; + compression_type() = kNoCompression; #ifndef NDEBUG contents_->has_trailer = footer_.GetBlockTrailerSize() > 0; #endif // NDEBUG @@ -384,19 +389,16 @@ IOStatus BlockFetcher::ReadBlockContents() { } } - if (do_uncompress_ && compression_type_ != kNoCompression) { + if (do_uncompress_ && compression_type() != kNoCompression) { PERF_TIMER_GUARD(block_decompress_time); - // compressed page, uncompress, update cache - UncompressionContext context(compression_type_); - UncompressionInfo info(context, uncompression_dict_, compression_type_); - io_status_ = status_to_io_status(UncompressSerializedBlock( - info, slice_.data(), block_size_, contents_, footer_.format_version(), - ioptions_, memory_allocator_)); + // Process the compressed block without trailer + slice_.size_ = block_size_; + decomp_args_.compressed_data = slice_; + io_status_ = status_to_io_status(DecompressSerializedBlock( + decomp_args_, *decompressor_, contents_, ioptions_, memory_allocator_)); #ifndef NDEBUG num_heap_buf_memcpy_++; #endif - // Save the compressed block without trailer - slice_ = Slice(slice_.data(), block_size_); } else { GetBlockContents(); slice_ = Slice(); @@ -409,7 +411,7 @@ IOStatus BlockFetcher::ReadBlockContents() { IOStatus BlockFetcher::ReadAsyncBlockContents() { if (TryGetUncompressBlockFromPersistentCache()) { - compression_type_ = kNoCompression; + compression_type() = kNoCompression; #ifndef NDEBUG contents_->has_trailer = footer_.GetBlockTrailerSize() > 0; #endif // NDEBUG @@ -418,7 +420,8 @@ IOStatus BlockFetcher::ReadAsyncBlockContents() { assert(prefetch_buffer_ != nullptr); if (!for_compaction_) { IOOptions opts; - IOStatus io_s = file_->PrepareIOOptions(read_options_, opts); + IODebugContext dbg; + IOStatus io_s = file_->PrepareIOOptions(read_options_, opts, &dbg); if (!io_s.ok()) { return io_s; } @@ -441,15 +444,14 @@ IOStatus BlockFetcher::ReadAsyncBlockContents() { } used_buf_ = const_cast(slice_.data()); - if (do_uncompress_ && compression_type_ != kNoCompression) { + if (do_uncompress_ && compression_type() != kNoCompression) { PERF_TIMER_GUARD(block_decompress_time); - // compressed page, uncompress, update cache - UncompressionContext context(compression_type_); - UncompressionInfo info(context, uncompression_dict_, - compression_type_); - io_status_ = status_to_io_status(UncompressSerializedBlock( - info, slice_.data(), block_size_, contents_, - footer_.format_version(), ioptions_, memory_allocator_)); + // Process the compressed block without trailer + slice_.size_ = block_size_; + decomp_args_.compressed_data = slice_; + io_status_ = status_to_io_status( + DecompressSerializedBlock(decomp_args_, *decompressor_, contents_, + ioptions_, memory_allocator_)); #ifndef NDEBUG num_heap_buf_memcpy_++; #endif diff --git a/table/block_fetcher.h b/table/block_fetcher.h index 9441e0a73cae..76e59369f093 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -14,6 +14,7 @@ #include "table/block_based/block_type.h" #include "table/format.h" #include "table/persistent_cache_options.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { @@ -46,7 +47,7 @@ class BlockFetcher { BlockContents* contents, const ImmutableOptions& ioptions /* ref retained */, bool do_uncompress, bool maybe_compressed, BlockType block_type, - const UncompressionDict& uncompression_dict /* ref retained */, + UnownedPtr decompressor, const PersistentCacheOptions& cache_options /* ref retained */, MemoryAllocator* memory_allocator = nullptr, MemoryAllocator* memory_allocator_compressed = nullptr, @@ -63,7 +64,7 @@ class BlockFetcher { block_type_(block_type), block_size_(static_cast(handle_.size())), block_size_with_trailer_(block_size_ + footer.GetBlockTrailerSize()), - uncompression_dict_(uncompression_dict), + decompressor_(decompressor), cache_options_(cache_options), memory_allocator_(memory_allocator), memory_allocator_compressed_(memory_allocator_compressed), @@ -81,14 +82,17 @@ class BlockFetcher { IOStatus ReadBlockContents(); IOStatus ReadAsyncBlockContents(); - inline CompressionType get_compression_type() const { - return compression_type_; + inline CompressionType compression_type() const { + return decomp_args_.compression_type; + } + inline CompressionType& compression_type() { + return decomp_args_.compression_type; } inline size_t GetBlockSizeWithTrailer() const { return block_size_with_trailer_; } inline Slice& GetCompressedBlock() { - assert(compression_type_ != kNoCompression); + assert(compression_type() != kNoCompression); return slice_; } @@ -121,7 +125,7 @@ class BlockFetcher { const BlockType block_type_; const size_t block_size_; const size_t block_size_with_trailer_; - const UncompressionDict& uncompression_dict_; + UnownedPtr decompressor_; const PersistentCacheOptions& cache_options_; MemoryAllocator* memory_allocator_; MemoryAllocator* memory_allocator_compressed_; @@ -133,11 +137,11 @@ class BlockFetcher { CacheAllocationPtr compressed_buf_; char stack_buf_[kDefaultStackBufferSize]; bool got_from_prefetch_buffer_ = false; - CompressionType compression_type_; bool for_compaction_ = false; bool use_fs_scratch_ = false; bool retry_corrupt_read_ = false; FSAllocationPtr fs_buf_; + Decompressor::Args decomp_args_; // return true if found bool TryGetUncompressBlockFromPersistentCache(); diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc index 17310edec6ae..e3d5dff735fd 100644 --- a/table/block_fetcher_test.cc +++ b/table/block_fetcher_test.cc @@ -319,10 +319,11 @@ class BlockFetcherTest : public testing::Test { PersistentCacheOptions persistent_cache_options; Footer footer; ReadFooter(file, &footer); + auto mgr = GetBuiltinV2CompressionManager(); std::unique_ptr fetcher(new BlockFetcher( file, nullptr /* prefetch_buffer */, footer, roptions, block, contents, ioptions, do_uncompress, compressed, block_type, - UncompressionDict::GetEmptyDict(), persistent_cache_options, + mgr->GetDecompressor().get(), persistent_cache_options, heap_buf_allocator, compressed_buf_allocator)); ASSERT_OK(fetcher->ReadBlockContents()); @@ -335,7 +336,7 @@ class BlockFetcherTest : public testing::Test { if (do_uncompress) { *compression_type = kNoCompression; } else { - *compression_type = fetcher->get_compression_type(); + *compression_type = fetcher->compression_type(); } } diff --git a/table/cleanable_test.cc b/table/cleanable_test.cc index b58eb7dc61e2..c53571bf0077 100644 --- a/table/cleanable_test.cc +++ b/table/cleanable_test.cc @@ -31,7 +31,9 @@ void Multiplier(void* arg1, void* arg2) { TEST_F(CleanableTest, Register) { int n2 = 2, n3 = 3; int res = 1; - { Cleanable c1; } + { + Cleanable c1; + } // ~Cleanable ASSERT_EQ(1, res); diff --git a/table/external_table.cc b/table/external_table.cc new file mode 100644 index 000000000000..5fc20f406929 --- /dev/null +++ b/table/external_table.cc @@ -0,0 +1,487 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/external_table.h" + +#include "logging/logging.h" +#include "rocksdb/table.h" +#include "table/block_based/block.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "table/table_builder.h" +#include "table/table_reader.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +class ExternalTableIteratorAdapter : public InternalIterator { + public: + explicit ExternalTableIteratorAdapter(ExternalTableIterator* iterator) + : iterator_(iterator), valid_(false) {} + + // No copying allowed + ExternalTableIteratorAdapter(const ExternalTableIteratorAdapter&) = delete; + ExternalTableIteratorAdapter& operator=(const ExternalTableIteratorAdapter&) = + delete; + + ~ExternalTableIteratorAdapter() override {} + + bool Valid() const override { return valid_; } + + void SeekToFirst() override { + status_ = Status::OK(); + if (iterator_) { + iterator_->SeekToFirst(); + UpdateKey(OptSlice()); + } + } + + void SeekToLast() override { + status_ = Status::OK(); + if (iterator_) { + iterator_->SeekToLast(); + UpdateKey(OptSlice()); + } + } + + void Seek(const Slice& target) override { + status_ = Status::OK(); + if (iterator_) { + ParsedInternalKey pkey; + status_ = ParseInternalKey(target, &pkey, /*log_err_key=*/false); + if (status_.ok()) { + iterator_->Seek(pkey.user_key); + UpdateKey(OptSlice()); + } + } + } + + void SeekForPrev(const Slice& target) override { + status_ = Status::OK(); + if (iterator_) { + ParsedInternalKey pkey; + status_ = ParseInternalKey(target, &pkey, /*log_err_key=*/false); + if (status_.ok()) { + iterator_->SeekForPrev(pkey.user_key); + UpdateKey(OptSlice()); + } + } + } + + void Next() override { + if (iterator_) { + iterator_->Next(); + UpdateKey(OptSlice()); + } + } + + bool NextAndGetResult(IterateResult* result) override { + if (iterator_) { + valid_ = iterator_->NextAndGetResult(&result_); + result->value_prepared = result_.value_prepared; + result->bound_check_result = result_.bound_check_result; + if (valid_) { + UpdateKey(result_.key); + result->key = key(); + } + } else { + valid_ = false; + } + return valid_; + } + + bool PrepareValue() override { + if (iterator_ && !result_.value_prepared) { + valid_ = iterator_->PrepareValue(); + result_.value_prepared = true; + } + return valid_; + } + + IterBoundCheck UpperBoundCheckResult() override { + if (iterator_) { + result_.bound_check_result = iterator_->UpperBoundCheckResult(); + } + return result_.bound_check_result; + } + + void Prev() override { + if (iterator_) { + iterator_->Prev(); + UpdateKey(OptSlice()); + } + } + + Slice key() const override { + if (iterator_) { + return Slice(*key_.const_rep()); + } + return Slice(); + } + + Slice value() const override { + if (iterator_) { + return iterator_->value(); + } + return Slice(); + } + + Status status() const override { return status_; } + + void Prepare(const MultiScanArgs* scan_opts) override { + if (iterator_ && scan_opts) { + iterator_->Prepare(scan_opts->GetScanRanges().data(), scan_opts->size()); + } else if (iterator_) { + iterator_->Prepare(nullptr, 0); + } + } + + private: + std::unique_ptr iterator_; + InternalKey key_; + bool valid_; + Status status_; + IterateResult result_; + + void UpdateKey(OptSlice res) { + if (iterator_) { + valid_ = iterator_->Valid(); + status_ = iterator_->status(); + if (valid_ && status_.ok()) { + key_.Set(res.has_value() ? res.value() : iterator_->key(), 0, + ValueType::kTypeValue); + } + } + } +}; + +class ExternalTableReaderAdapter : public TableReader { + public: + explicit ExternalTableReaderAdapter( + const ImmutableOptions& ioptions, + std::unique_ptr&& reader) + : ioptions_(ioptions), reader_(std::move(reader)) {} + + ~ExternalTableReaderAdapter() override {} + + // No copying allowed + ExternalTableReaderAdapter(const ExternalTableReaderAdapter&) = delete; + ExternalTableReaderAdapter& operator=(const ExternalTableReaderAdapter&) = + delete; + + InternalIterator* NewIterator( + const ReadOptions& read_options, const SliceTransform* prefix_extractor, + Arena* arena, bool /* skip_filters */, TableReaderCaller /* caller */, + size_t /* compaction_readahead_size */ = 0, + bool /* allow_unprepared_value */ = false) override { + auto iterator = reader_->NewIterator(read_options, prefix_extractor); + if (arena == nullptr) { + return new ExternalTableIteratorAdapter(iterator); + } else { + auto* mem = arena->AllocateAligned(sizeof(ExternalTableIteratorAdapter)); + return new (mem) ExternalTableIteratorAdapter(iterator); + } + } + + uint64_t ApproximateOffsetOf(const ReadOptions&, const Slice&, + TableReaderCaller) override { + return 0; + } + + uint64_t ApproximateSize(const ReadOptions&, const Slice&, const Slice&, + TableReaderCaller) override { + return 0; + } + + void SetupForCompaction() override {} + + std::shared_ptr GetTableProperties() const override { + std::shared_ptr props; + std::unique_ptr property_block; + uint64_t property_block_size = 0; + uint64_t property_block_offset = 0; + Status s; + // Get the raw properties block from the external table reader. We don't + // support writing the global sequence number, but we still get and return + // the correct global seqno offset in the file to prevent accidental + // corruption. + s = reader_->GetPropertiesBlock(&property_block, &property_block_size, + &property_block_offset); + if (s.ok()) { + std::unique_ptr table_properties = + std::make_unique(); + BlockContents block_contents(std::move(property_block), + property_block_size); + Block block(std::move(block_contents)); + s = ParsePropertiesBlock(ioptions_, property_block_offset, block, + table_properties); + if (s.ok()) { + props.reset(table_properties.release()); + } + } else { + // Fallback to getting a minimal table properties structure from the + // external table reader + props = std::make_shared(*reader_->GetTableProperties()); + props->key_largest_seqno = 0; + props->key_smallest_seqno = 0; + } + return props; + } + + size_t ApproximateMemoryUsage() const override { return 0; } + + Status Get(const ReadOptions&, const Slice&, GetContext*, + const SliceTransform*, bool = false) override { + return Status::NotSupported( + "Get() not supported on external file iterator"); + } + + Status VerifyChecksum(const ReadOptions& /*ro*/, TableReaderCaller /*caller*/, + bool /*meta_blocks_only*/ = false) override { + return Status::OK(); + } + + private: + const ImmutableOptions& ioptions_; + std::unique_ptr reader_; +}; + +class ExternalTableBuilderAdapter : public TableBuilder { + public: + explicit ExternalTableBuilderAdapter( + const TableBuilderOptions& topts, + std::unique_ptr&& builder, + std::unique_ptr&& file) + : builder_(std::move(builder)), + file_(std::move(file)), + ioptions_(topts.ioptions) { + properties_.num_data_blocks = 1; + properties_.index_size = 0; + properties_.filter_size = 0; + properties_.format_version = 0; + properties_.key_largest_seqno = 0; + properties_.key_smallest_seqno = 0; + properties_.column_family_id = topts.column_family_id; + properties_.column_family_name = topts.column_family_name; + properties_.db_id = topts.db_id; + properties_.db_session_id = topts.db_session_id; + properties_.db_host_id = topts.ioptions.db_host_id; + if (!ReifyDbHostIdProperty(topts.ioptions.env, &properties_.db_host_id) + .ok()) { + ROCKS_LOG_INFO(topts.ioptions.logger, + "db_host_id property will not be set"); + } + properties_.orig_file_number = topts.cur_file_num; + properties_.comparator_name = topts.ioptions.user_comparator != nullptr + ? topts.ioptions.user_comparator->Name() + : "nullptr"; + properties_.prefix_extractor_name = + topts.moptions.prefix_extractor != nullptr + ? topts.moptions.prefix_extractor->AsString() + : "nullptr"; + + for (auto& factory : *topts.internal_tbl_prop_coll_factories) { + assert(factory); + std::unique_ptr collector{ + factory->CreateInternalTblPropColl(topts.column_family_id, + topts.level_at_creation, + topts.ioptions.num_levels)}; + if (collector) { + table_properties_collectors_.emplace_back(std::move(collector)); + } + } + } + + void Add(const Slice& key, const Slice& value) override { + ParsedInternalKey pkey; + status_ = ParseInternalKey(key, &pkey, /*log_err_key=*/false); + if (status_.ok()) { + if (pkey.type != ValueType::kTypeValue) { + status_ = Status::NotSupported( + "Value type " + std::to_string(pkey.type) + "not supported"); + } else { + builder_->Add(pkey.user_key, value); + properties_.num_entries++; + properties_.raw_key_size += key.size(); + properties_.raw_value_size += value.size(); + NotifyCollectTableCollectorsOnAdd(key, value, /*file_size=*/0, + table_properties_collectors_, + ioptions_.logger); + } + } + } + + Status status() const override { + if (status_.ok()) { + return builder_->status(); + } else { + return status_; + } + } + + IOStatus io_status() const override { return status_to_io_status(status()); } + + Status Finish() override { + // Approximate the data size + properties_.data_size = + properties_.raw_key_size + properties_.raw_value_size; + + PropertyBlockBuilder property_block_builder; + property_block_builder.AddTableProperty(properties_); + UserCollectedProperties more_user_collected_properties; + NotifyCollectTableCollectorsOnFinish( + table_properties_collectors_, ioptions_.logger, &property_block_builder, + more_user_collected_properties, properties_.readable_properties); + properties_.user_collected_properties.insert( + more_user_collected_properties.begin(), + more_user_collected_properties.end()); + + Slice prop_block = property_block_builder.Finish(); + Status s = builder_->PutPropertiesBlock(prop_block); + if (s.ok() || s.IsNotSupported()) { + // If the builder doesn't support writing the properties block, + // we still call Finish() and let the external builder handle it. + s = builder_->Finish(); + } + + return s; + } + + void Abandon() override { builder_->Abandon(); } + + uint64_t FileSize() const override { return builder_->FileSize(); } + + uint64_t NumEntries() const override { return properties_.num_entries; } + + TableProperties GetTableProperties() const override { + return builder_->GetTableProperties(); + } + + std::string GetFileChecksum() const override { + return builder_->GetFileChecksum(); + } + + const char* GetFileChecksumFuncName() const override { + return builder_->GetFileChecksumFuncName(); + } + + private: + Status status_; + std::unique_ptr builder_; + std::unique_ptr file_; + const ImmutableOptions& ioptions_; + TableProperties properties_; + std::vector> + table_properties_collectors_; +}; + +class ExternalTableFactoryAdapter : public TableFactory { + public: + explicit ExternalTableFactoryAdapter( + std::shared_ptr inner) + : inner_(std::move(inner)) {} + + const char* Name() const override { return inner_->Name(); } + + using TableFactory::NewTableReader; + Status NewTableReader( + const ReadOptions& ro, const TableReaderOptions& topts, + std::unique_ptr&& file, uint64_t /* file_size */, + std::unique_ptr* table_reader, + bool /* prefetch_index_and_filter_in_cache */) const override { + // SstFileReader specifies largest_seqno as kMaxSequenceNumber to denote + // that its unknown + if (topts.largest_seqno > 0 && topts.largest_seqno != kMaxSequenceNumber) { + return Status::NotSupported( + "Ingesting file with sequence number larger than 0"); + } + std::unique_ptr reader; + FileOptions fopts(topts.env_options); + ExternalTableOptions ext_topts(topts.prefix_extractor, + topts.ioptions.user_comparator, + topts.ioptions.fs, fopts); + auto status = + inner_->NewTableReader(ro, file->file_name(), ext_topts, &reader); + if (!status.ok()) { + return status; + } + table_reader->reset( + new ExternalTableReaderAdapter(topts.ioptions, std::move(reader))); + file.reset(); + return Status::OK(); + } + + using TableFactory::NewTableBuilder; + TableBuilder* NewTableBuilder(const TableBuilderOptions& topts, + WritableFileWriter* file) const override { + std::unique_ptr builder; + ExternalTableBuilderOptions ext_topts( + topts.read_options, topts.write_options, + topts.moptions.prefix_extractor, topts.ioptions.user_comparator, + topts.column_family_name, topts.reason); + auto file_wrapper = + std::make_unique(file); + builder.reset(inner_->NewTableBuilder(ext_topts, file->file_name(), + file_wrapper.get())); + if (builder) { + return new ExternalTableBuilderAdapter(topts, std::move(builder), + std::move(file_wrapper)); + } + return nullptr; + } + + std::unique_ptr Clone() const override { return nullptr; } + + private: + // An FSWritableFile subclass for wrapping a WritableFileWriter. The + // latter is private to RocksDB, so we wrap it here in order to pass it + // to the ExternalTableBuilder. This is necessary for WritableFileWriter + // to intercept Append so that it can calculate the file checksum. + class ExternalTableWritableFileWrapper : public FSWritableFile { + public: + explicit ExternalTableWritableFileWrapper(WritableFileWriter* writer) + : writer_(writer) {} + + using FSWritableFile::Append; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* /*dbg*/) override { + return writer_->Append(options, data); + } + + IOStatus Close(const IOOptions& options, IODebugContext* /*dbg*/) override { + return writer_->Close(options); + } + + IOStatus Flush(const IOOptions& options, IODebugContext* /*dbg*/) override { + return writer_->Flush(options); + } + + IOStatus Sync(const IOOptions& options, IODebugContext* /*dbg*/) override { + return writer_->Sync(options, /*use_fsync=*/false); + } + + uint64_t GetFileSize(const IOOptions& options, + IODebugContext* dbg) override { + return writer_->writable_file()->GetFileSize(options, dbg); + } + + private: + WritableFileWriter* writer_; + }; + + std::shared_ptr inner_; +}; + +} // namespace + +std::unique_ptr NewExternalTableFactory( + std::shared_ptr inner_factory) { + std::unique_ptr res; + res = std::make_unique(std::move(inner_factory)); + return res; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/table/external_table_reader.cc b/table/external_table_reader.cc deleted file mode 100644 index fdd0de0a0674..000000000000 --- a/table/external_table_reader.cc +++ /dev/null @@ -1,220 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#include "rocksdb/external_table_reader.h" - -#include "rocksdb/table.h" -#include "table/internal_iterator.h" -#include "table/table_builder.h" -#include "table/table_reader.h" - -namespace ROCKSDB_NAMESPACE { - -namespace { - -class ExternalTableIterator : public InternalIterator { - public: - explicit ExternalTableIterator(Iterator* iterator) : iterator_(iterator) {} - - // No copying allowed - ExternalTableIterator(const ExternalTableIterator&) = delete; - ExternalTableIterator& operator=(const ExternalTableIterator&) = delete; - - ~ExternalTableIterator() override {} - - bool Valid() const override { return iterator_ && iterator_->Valid(); } - - void SeekToFirst() override { - status_ = Status::OK(); - if (iterator_) { - iterator_->SeekToFirst(); - UpdateKey(); - } - } - - void SeekToLast() override { - status_ = Status::OK(); - if (iterator_) { - iterator_->SeekToLast(); - UpdateKey(); - } - } - - void Seek(const Slice& target) override { - status_ = Status::OK(); - if (iterator_) { - ParsedInternalKey pkey; - status_ = ParseInternalKey(target, &pkey, /*log_err_key=*/false); - if (status_.ok()) { - iterator_->Seek(pkey.user_key); - UpdateKey(); - } - } - } - - void SeekForPrev(const Slice& target) override { - status_ = Status::OK(); - if (iterator_) { - ParsedInternalKey pkey; - status_ = ParseInternalKey(target, &pkey, /*log_err_key=*/false); - if (status_.ok()) { - iterator_->SeekForPrev(pkey.user_key); - UpdateKey(); - } - } - } - - void Next() override { - if (iterator_) { - iterator_->Next(); - UpdateKey(); - } - } - - void Prev() override { - if (iterator_) { - iterator_->Prev(); - UpdateKey(); - } - } - - Slice key() const override { - if (iterator_) { - return Slice(*key_.const_rep()); - } - return Slice(); - } - - Slice value() const override { - if (iterator_) { - return iterator_->value(); - } - return Slice(); - } - - Status status() const override { - return !status_.ok() ? status_ - : (iterator_ ? iterator_->status() : Status::OK()); - } - - private: - std::unique_ptr iterator_; - InternalKey key_; - Status status_; - - void UpdateKey() { key_.Set(iterator_->key(), 0, ValueType::kTypeValue); } -}; - -class ExternalTableReaderAdapter : public TableReader { - public: - explicit ExternalTableReaderAdapter( - std::unique_ptr reader) - : reader_(std::move(reader)) {} - - ~ExternalTableReaderAdapter() override {} - - // No copying allowed - ExternalTableReaderAdapter(const ExternalTableReaderAdapter&) = delete; - ExternalTableReaderAdapter& operator=(const ExternalTableReaderAdapter&) = - delete; - - InternalIterator* NewIterator( - const ReadOptions& read_options, const SliceTransform* prefix_extractor, - Arena* arena, bool /* skip_filters */, TableReaderCaller /* caller */, - size_t /* compaction_readahead_size */ = 0, - bool /* allow_unprepared_value */ = false) override { - auto iterator = reader_->NewIterator(read_options, prefix_extractor); - if (arena == nullptr) { - return new ExternalTableIterator(iterator); - } else { - auto* mem = arena->AllocateAligned(sizeof(ExternalTableIterator)); - return new (mem) ExternalTableIterator(iterator); - } - } - - uint64_t ApproximateOffsetOf(const ReadOptions&, const Slice&, - TableReaderCaller) override { - return 0; - } - - uint64_t ApproximateSize(const ReadOptions&, const Slice&, const Slice&, - TableReaderCaller) override { - return 0; - } - - void SetupForCompaction() override {} - - std::shared_ptr GetTableProperties() const override { - std::shared_ptr props = - std::make_shared(*reader_->GetTableProperties()); - props->key_largest_seqno = 0; - return props; - } - - size_t ApproximateMemoryUsage() const override { return 0; } - - Status Get(const ReadOptions&, const Slice&, GetContext*, - const SliceTransform*, bool = false) override { - return Status::NotSupported( - "Get() not supported on external file iterator"); - } - - virtual Status VerifyChecksum(const ReadOptions& /*ro*/, - TableReaderCaller /*caller*/) override { - return Status::OK(); - } - - private: - std::unique_ptr reader_; -}; - -class ExternalTableFactoryAdapter : public TableFactory { - public: - explicit ExternalTableFactoryAdapter( - std::shared_ptr inner) - : inner_(std::move(inner)) {} - - const char* Name() const override { return inner_->Name(); } - - using TableFactory::NewTableReader; - Status NewTableReader( - const ReadOptions& ro, const TableReaderOptions& topts, - std::unique_ptr&& file, uint64_t /* file_size */, - std::unique_ptr* table_reader, - bool /* prefetch_index_and_filter_in_cache */) const override { - std::unique_ptr reader; - ExternalTableOptions ext_topts(topts.prefix_extractor, - topts.ioptions.user_comparator); - auto status = - inner_->NewTableReader(ro, file->file_name(), ext_topts, &reader); - if (!status.ok()) { - return status; - } - table_reader->reset(new ExternalTableReaderAdapter(std::move(reader))); - file.reset(); - return Status::OK(); - } - - TableBuilder* NewTableBuilder(const TableBuilderOptions&, - WritableFileWriter*) const override { - return nullptr; - } - - std::unique_ptr Clone() const override { return nullptr; } - - private: - std::shared_ptr inner_; -}; - -} // namespace - -std::shared_ptr NewExternalTableFactory( - std::shared_ptr inner_factory) { - std::shared_ptr res; - res.reset(new ExternalTableFactoryAdapter(std::move(inner_factory))); - return res; -} - -} // namespace ROCKSDB_NAMESPACE diff --git a/table/format.cc b/table/format.cc index 46de42fbe9e2..d0f80009d442 100644 --- a/table/format.cc +++ b/table/format.cc @@ -154,23 +154,18 @@ std::string IndexValue::ToString(bool hex, bool have_first_key) const { namespace { inline bool IsLegacyFooterFormat(uint64_t magic_number) { - return magic_number == kLegacyBlockBasedTableMagicNumber || - magic_number == kLegacyPlainTableMagicNumber; + return magic_number == kLegacyPlainTableMagicNumber; } +// Used when reading format_version=0 footers (plain tables) inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { - if (magic_number == kLegacyBlockBasedTableMagicNumber) { - return kBlockBasedTableMagicNumber; - } if (magic_number == kLegacyPlainTableMagicNumber) { return kPlainTableMagicNumber; } assert(false); return magic_number; } +// Used by plain tables to write format_version=0 footers inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) { - if (magic_number == kBlockBasedTableMagicNumber) { - return kLegacyBlockBasedTableMagicNumber; - } if (magic_number == kPlainTableMagicNumber) { return kLegacyPlainTableMagicNumber; } @@ -178,14 +173,18 @@ inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) { return magic_number; } inline uint8_t BlockTrailerSizeForMagicNumber(uint64_t magic_number) { - if (magic_number == kBlockBasedTableMagicNumber || - magic_number == kLegacyBlockBasedTableMagicNumber) { + if (magic_number == kBlockBasedTableMagicNumber) { return static_cast(BlockBasedTable::kBlockTrailerSize); } else { return 0; } } +// NOTE: format_version 0 is still used by plain tables and format_version 1 by +// cuckoo table. For block-based tables, format_version < 2 is no longer +// supported for reading or writing. Legacy magic numbers on block-based tables +// are used only for good error reporting. +// // Footer format, in three parts: // * Part1 // -> format_version == 0 (inferred from legacy magic number) @@ -229,7 +228,8 @@ Status FooterBuilder::Build(uint64_t magic_number, uint32_t format_version, const BlockHandle& index_handle, uint32_t base_context_checksum) { assert(magic_number != Footer::kNullTableMagicNumber); - assert(IsSupportedFormatVersion(format_version)); + assert(IsSupportedFormatVersionForWrite(magic_number, format_version) || + TEST_AllowUnsupportedFormatVersion()); char* part2; char* part3; @@ -250,6 +250,7 @@ Status FooterBuilder::Build(uint64_t magic_number, uint32_t format_version, EncodeFixed64(cur, magic_number); assert(cur + 8 == slice_.data() + slice_.size()); } else { + // format_version == 0 is used by plain tables slice_ = Slice(data_.data(), Footer::kVersion0EncodedLength); // Legacy SST files use kCRC32c checksum but it's not stored in footer. assert(checksum_type == kNoChecksum || checksum_type == kCRC32c); @@ -336,9 +337,18 @@ Status Footer::DecodeFrom(Slice input, uint64_t input_offset, const char* magic_ptr = input.data() + input.size() - kMagicNumberLengthByte; uint64_t magic = DecodeFixed64(magic_ptr); - // We check for legacy formats here and silently upconvert them + // Legacy block-based tables (format_version < 2) are no longer supported. + // (This constant is only used here and in the corresponding test.) + if (magic == 0xdb4775248b80fb57ull) { + return Status::NotSupported( + "Unsupported legacy magic number for block-based SST format. Load with " + "RocksDB >= 4.6.0 and < 11.0.0 and run full compaction to upgrade."); + } + + // Check for legacy formats bool legacy = IsLegacyFooterFormat(magic); if (legacy) { + // Legacy plain tables are still supported - upconvert magic magic = UpconvertLegacyFooterFormat(magic); } if (enforce_table_magic_number != 0 && enforce_table_magic_number != magic) { @@ -354,6 +364,7 @@ Status Footer::DecodeFrom(Slice input, uint64_t input_offset, uint32_t computed_checksum = 0; uint64_t footer_offset = 0; if (legacy) { + // Legacy format (format_version=0, used by plain tables) // The size is already asserted to be at least kMinEncodedLength // at the beginning of the function input.remove_prefix(input.size() - kVersion0EncodedLength); @@ -362,9 +373,11 @@ Status Footer::DecodeFrom(Slice input, uint64_t input_offset, } else { part3_ptr = magic_ptr - 4; format_version_ = DecodeFixed32(part3_ptr); - if (UNLIKELY(!IsSupportedFormatVersion(format_version_))) { - return Status::Corruption("Corrupt or unsupported format_version: " + - std::to_string(format_version_)); + if (UNLIKELY(!IsSupportedFormatVersionForRead(magic, format_version_) && + !TEST_AllowUnsupportedFormatVersion())) { + return Status::Corruption("Corrupt or unsupported format_version " + + std::to_string(format_version_) + + " for magic " + std::to_string(magic)); } // All known format versions >= 1 occupy exactly this many bytes. if (UNLIKELY(input.size() < kNewVersionsEncodedLength)) { @@ -475,15 +488,41 @@ std::string Footer::ToString() const { return result; } -static Status ReadFooterFromFileInternal(const IOOptions& opts, - RandomAccessFileReader* file, - FileSystem& fs, - FilePrefetchBuffer* prefetch_buffer, - uint64_t file_size, Footer* footer, - uint64_t enforce_table_magic_number) { - if (file_size < Footer::kMinEncodedLength) { +bool& TEST_AllowUnsupportedFormatVersion() { + static bool allow = false; + return allow; +} + +static Status ReadFooterFromFileInternal( + const IOOptions& opts, RandomAccessFileReader* file, FileSystem& fs, + FilePrefetchBuffer* prefetch_buffer, uint64_t expected_file_size, + Footer* footer, uint64_t enforce_table_magic_number) { + uint64_t file_size_from_file_system = 0; + Status s; + // Prefer the more efficient FSRandomAccessFile::GetFileSize when available + s = file->file()->GetFileSize(&file_size_from_file_system); + if (!s.ok()) { + // Fall back on FileSystem::GetFileSize on failure + s = fs.GetFileSize(file->file_name(), IOOptions(), + &file_size_from_file_system, nullptr); + if (!s.ok()) { + return s; + } + } + + if (expected_file_size != file_size_from_file_system) { + // When file is opened during DB Open, the expected file size is from + // manifest. Otherwise it is not guaranteed. + return Status::Corruption("Sst file size mismatch between expected " + + std::to_string(expected_file_size) + + " and file system " + + std::to_string(file_size_from_file_system) + + " sstable: " + file->file_name()); + } + + if (expected_file_size < Footer::kMinEncodedLength) { return Status::Corruption("file is too short (" + - std::to_string(file_size) + + std::to_string(expected_file_size) + " bytes) to be an " "sstable: " + file->file_name()); @@ -492,10 +531,9 @@ static Status ReadFooterFromFileInternal(const IOOptions& opts, std::array footer_buf; AlignedBuf internal_buf; Slice footer_input; - uint64_t read_offset = (file_size > Footer::kMaxEncodedLength) - ? file_size - Footer::kMaxEncodedLength + uint64_t read_offset = (expected_file_size > Footer::kMaxEncodedLength) + ? expected_file_size - Footer::kMaxEncodedLength : 0; - Status s; // TODO: Need to pass appropriate deadline to TryReadFromCache(). Right now, // there is no readahead for point lookups, so TryReadFromCache will fail if // the required data is not in the prefetch buffer. Once deadline is enabled @@ -520,23 +558,14 @@ static Status ReadFooterFromFileInternal(const IOOptions& opts, TEST_SYNC_POINT_CALLBACK("ReadFooterFromFileInternal:0", &footer_input); - // Check that we actually read the whole footer from the file. It may be - // that size isn't correct. + // Check that we actually read the whole footer from the file. if (footer_input.size() < Footer::kMinEncodedLength) { - uint64_t size_on_disk = 0; - if (fs.GetFileSize(file->file_name(), IOOptions(), &size_on_disk, nullptr) - .ok()) { - // Similar to CheckConsistency message, but not completely sure the - // expected size always came from manifest. - return Status::Corruption("Sst file size mismatch: " + file->file_name() + - ". Expected " + std::to_string(file_size) + - ", actual size " + - std::to_string(size_on_disk) + "\n"); - } else { - return Status::Corruption( - "Missing SST footer data in file " + file->file_name() + - " File too short? Expected size: " + std::to_string(file_size)); - } + return Status::Corruption( + "The number of bytes read for Footer input " + + std::to_string(footer_input.size()) + + " is smaller than minimum footer encoded length: " + + std::to_string(Footer::kMinEncodedLength) + " for file " + + file->file_name() + "\n"); } s = footer->DecodeFrom(footer_input, read_offset, enforce_table_magic_number); @@ -549,20 +578,21 @@ static Status ReadFooterFromFileInternal(const IOOptions& opts, Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, FileSystem& fs, FilePrefetchBuffer* prefetch_buffer, - uint64_t file_size, Footer* footer, + uint64_t expected_file_size, Footer* footer, uint64_t enforce_table_magic_number, Statistics* stats) { - Status s = - ReadFooterFromFileInternal(opts, file, fs, prefetch_buffer, file_size, - footer, enforce_table_magic_number); + Status s = ReadFooterFromFileInternal(opts, file, fs, prefetch_buffer, + expected_file_size, footer, + enforce_table_magic_number); if (s.IsCorruption() && CheckFSFeatureSupport(&fs, FSSupportedOps::kVerifyAndReconstructRead)) { IOOptions new_opts = opts; new_opts.verify_and_reconstruct_read = true; footer->Reset(); s = ReadFooterFromFileInternal(new_opts, file, fs, - /*prefetch_buffer=*/nullptr, file_size, - footer, enforce_table_magic_number); + /*prefetch_buffer=*/nullptr, + expected_file_size, footer, + enforce_table_magic_number); RecordTick(stats, FILE_READ_CORRUPTION_RETRY_COUNT); if (s.ok()) { RecordTick(stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT); @@ -653,70 +683,81 @@ uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data, } } -Status UncompressBlockData(const UncompressionInfo& uncompression_info, - const char* data, size_t size, - BlockContents* out_contents, uint32_t format_version, +Status DecompressBlockData(Decompressor::Args& args, Decompressor& decompressor, + BlockContents* out_contents, const ImmutableOptions& ioptions, MemoryAllocator* allocator) { - Status ret = Status::OK(); - - assert(uncompression_info.type() != kNoCompression && - "Invalid compression type"); + assert(args.compression_type != kNoCompression && "Invalid compression type"); StopWatchNano timer(ioptions.clock, ShouldReportDetailedTime(ioptions.env, ioptions.stats)); - size_t uncompressed_size = 0; - const char* error_msg = nullptr; - CacheAllocationPtr ubuf = UncompressData( - uncompression_info, data, size, &uncompressed_size, - GetCompressFormatForVersion(format_version), allocator, &error_msg); - if (!ubuf) { - if (!CompressionTypeSupported(uncompression_info.type())) { - ret = Status::NotSupported( - "Unsupported compression method for this build", - CompressionTypeToString(uncompression_info.type())); - } else { - std::ostringstream oss; - oss << "Corrupted compressed block contents"; - if (error_msg) { - oss << ": " << error_msg; - } - ret = Status::Corruption( - oss.str(), CompressionTypeToString(uncompression_info.type())); - } - return ret; + + Status s = decompressor.ExtractUncompressedSize(args); + if (UNLIKELY(!s.ok())) { + return s; + } + CacheAllocationPtr ubuf = AllocateBlock(args.uncompressed_size, allocator); + s = decompressor.DecompressBlock(args, ubuf.get()); + if (UNLIKELY(!s.ok())) { + return s; } - *out_contents = BlockContents(std::move(ubuf), uncompressed_size); + *out_contents = BlockContents(std::move(ubuf), args.uncompressed_size); if (ShouldReportDetailedTime(ioptions.env, ioptions.stats)) { RecordTimeToHistogram(ioptions.stats, DECOMPRESSION_TIMES_NANOS, timer.ElapsedNanos()); } - RecordTick(ioptions.stats, BYTES_DECOMPRESSED_FROM, size); + RecordTick(ioptions.stats, BYTES_DECOMPRESSED_FROM, + args.compressed_data.size()); RecordTick(ioptions.stats, BYTES_DECOMPRESSED_TO, out_contents->data.size()); RecordTick(ioptions.stats, NUMBER_BLOCK_DECOMPRESSED); - TEST_SYNC_POINT_CALLBACK("UncompressBlockData:TamperWithReturnValue", - static_cast(&ret)); - TEST_SYNC_POINT_CALLBACK( - "UncompressBlockData:" - "TamperWithDecompressionOutput", - static_cast(out_contents)); + TEST_SYNC_POINT_CALLBACK("DecompressBlockData:TamperWithReturnValue", + static_cast(&s)); + TEST_SYNC_POINT_CALLBACK("DecompressBlockData:TamperWithDecompressionOutput", + static_cast(out_contents)); - return ret; + return s; } -Status UncompressSerializedBlock(const UncompressionInfo& uncompression_info, - const char* data, size_t size, +Status DecompressBlockData(const char* data, size_t size, CompressionType type, + Decompressor& decompressor, + BlockContents* out_contents, + const ImmutableOptions& ioptions, + MemoryAllocator* allocator, + Decompressor::ManagedWorkingArea* working_area) { + Decompressor::Args args; + args.compressed_data = Slice(data, size); + args.compression_type = type; + args.working_area = working_area; + return DecompressBlockData(args, decompressor, out_contents, ioptions, + allocator); +} + +Status DecompressSerializedBlock(const char* data, size_t size, + CompressionType type, + Decompressor& decompressor, BlockContents* out_contents, - uint32_t format_version, const ImmutableOptions& ioptions, MemoryAllocator* allocator) { assert(data[size] != kNoCompression); - assert(data[size] == static_cast(uncompression_info.type())); - return UncompressBlockData(uncompression_info, data, size, out_contents, - format_version, ioptions, allocator); + assert(data[size] == static_cast(type)); + return DecompressBlockData(data, size, type, decompressor, out_contents, + ioptions, allocator); +} + +Status DecompressSerializedBlock(Decompressor::Args& args, + Decompressor& decompressor, + BlockContents* out_contents, + const ImmutableOptions& ioptions, + MemoryAllocator* allocator) { + assert(args.compressed_data.data()[args.compressed_data.size()] != + kNoCompression); + assert(args.compressed_data.data()[args.compressed_data.size()] == + static_cast(args.compression_type)); + return DecompressBlockData(args, decompressor, out_contents, ioptions, + allocator); } // Replace the contents of db_host_id with the actual hostname, if db_host_id diff --git a/table/format.h b/table/format.h index dac5d695be45..be7c0fa8abff 100644 --- a/table/format.h +++ b/table/format.h @@ -34,7 +34,6 @@ bool ShouldReportDetailedTime(Env* env, Statistics* stats); // the length of the magic number in bytes. constexpr uint32_t kMagicNumberLengthByte = 8; -extern const uint64_t kLegacyBlockBasedTableMagicNumber; extern const uint64_t kBlockBasedTableMagicNumber; extern const uint64_t kLegacyPlainTableMagicNumber; @@ -55,7 +54,7 @@ class BlockHandle { uint64_t offset() const { return offset_; } void set_offset(uint64_t _offset) { offset_ = _offset; } - // The size of the stored block + // The size of the stored block, this size does not include the block trailer. uint64_t size() const { return size_; } void set_size(uint64_t _size) { size_ = _size; } @@ -90,6 +89,16 @@ class BlockHandle { static const BlockHandle kNullBlockHandle; }; +struct EncodedBlockHandle { + explicit EncodedBlockHandle(const BlockHandle& h) { + auto end = h.EncodeTo(buffer.data()); + size = end - buffer.data(); + } + Slice AsSlice() const { return Slice(buffer.data(), size); } + std::array buffer; + size_t size; +}; + // Value in block-based table file index. // // The index entry for block n is: y -> h, [x], @@ -153,17 +162,49 @@ inline uint32_t ChecksumModifierForContext(uint32_t base_context_checksum, return modifier & all_or_nothing; } -inline uint32_t GetCompressFormatForVersion(uint32_t format_version) { - // As of format_version 2, we encode compressed block with - // compress_format_version == 2. Before that, the version is 1. - // DO NOT CHANGE THIS FUNCTION, it affects disk format - return format_version >= 2 ? 2 : 1; -} +constexpr uint32_t kLatestBbtFormatVersion = 7; -constexpr uint32_t kLatestFormatVersion = 6; +// Minimum format version supported for reading SST files in block-based format. +// +// When phasing out old format versions, first increase the write minimum, +// then later (>= 6 mo) increase the read minimum when removing the +// implementation for both read and write. +constexpr uint32_t kMinSupportedBbtFormatVersionForRead = 2; + +// Minimum format version supported for writing new SST files in block-based +// format. This should be >= kMinSupportedFormatVersionForRead. +// +// When phasing out old format versions, first increase the write minimum, +// then later (>= 6 mo) increase the read minimum when removing the +// implementation for both read and write. +constexpr uint32_t kMinSupportedBbtFormatVersionForWrite = 2; +static_assert(kMinSupportedBbtFormatVersionForWrite >= + kMinSupportedBbtFormatVersionForRead); + +inline bool IsSupportedFormatVersionForRead(uint64_t magic, uint32_t version) { + if (magic == kBlockBasedTableMagicNumber) { + return version >= kMinSupportedBbtFormatVersionForRead && + version <= kLatestBbtFormatVersion; + } else if (magic == kPlainTableMagicNumber) { + return version == 0; + } else if (magic == kCuckooTableMagicNumber) { + return version == 1; + } else { + return false; + } +} -inline bool IsSupportedFormatVersion(uint32_t version) { - return version <= kLatestFormatVersion; +inline bool IsSupportedFormatVersionForWrite(uint64_t magic, uint32_t version) { + if (magic == kBlockBasedTableMagicNumber) { + return version >= kMinSupportedBbtFormatVersionForWrite && + version <= kLatestBbtFormatVersion; + } else if (magic == kPlainTableMagicNumber) { + return version == 0; + } else if (magic == kCuckooTableMagicNumber) { + return version == 1; + } else { + return false; + } } // Same as having a unique id in footer. @@ -175,6 +216,10 @@ inline bool FormatVersionUsesIndexHandleInFooter(uint32_t version) { return version < 6; } +inline bool FormatVersionUsesCompressionManagerName(uint32_t version) { + return version >= 7; +} + // Footer encapsulates the fixed information stored at the tail end of every // SST file. In general, it should only include things that cannot go // elsewhere under the metaindex block. For example, checksum_type is @@ -308,6 +353,10 @@ class FooterBuilder { std::array data_; }; +// Set to true to allow unit testing of writing unsupported block-based table +// format versions (to test read side) +bool& TEST_AllowUnsupportedFormatVersion(); + // Read the footer from file // If enforce_table_magic_number != 0, ReadFooterFromFile() will return // corruption if table_magic number is not equal to enforce_table_magic_number @@ -382,6 +431,7 @@ struct BlockContents { // The additional memory space taken by the block data. size_t usable_size() const { + // FIXME: doesn't account for possible block trailer if (allocation.get() != nullptr) { auto allocator = allocation.get_deleter().allocator; if (allocator) { @@ -416,21 +466,30 @@ struct BlockContents { // The `data` points to serialized block contents read in from file, which // must be compressed and include a trailer beyond `size`. A new buffer is // allocated with the given allocator (or default) and the uncompressed -// contents are returned in `out_contents`. -// format_version is as defined in include/rocksdb/table.h, which is -// used to determine compression format version. -Status UncompressSerializedBlock(const UncompressionInfo& info, - const char* data, size_t size, +// contents are returned in `out_contents`. Statistics updated. +Status DecompressSerializedBlock(const char* data, size_t size, + CompressionType type, + Decompressor& decompressor, BlockContents* out_contents, - uint32_t format_version, const ImmutableOptions& ioptions, MemoryAllocator* allocator = nullptr); -// This is a variant of UncompressSerializedBlock that does not expect a -// block trailer beyond `size`. (CompressionType is taken from `info`.) -Status UncompressBlockData(const UncompressionInfo& info, const char* data, - size_t size, BlockContents* out_contents, - uint32_t format_version, +Status DecompressSerializedBlock(Decompressor::Args& args, + Decompressor& decompressor, + BlockContents* out_contents, + const ImmutableOptions& ioptions, + MemoryAllocator* allocator = nullptr); + +// This is a variant of DecompressSerializedBlock that does not expect a +// block trailer beyond `size`. (CompressionType is passed in.) +Status DecompressBlockData( + const char* data, size_t size, CompressionType type, + Decompressor& decompressor, BlockContents* out_contents, + const ImmutableOptions& ioptions, MemoryAllocator* allocator = nullptr, + Decompressor::ManagedWorkingArea* working_area = nullptr); + +Status DecompressBlockData(Decompressor::Args& args, Decompressor& decompressor, + BlockContents* out_contents, const ImmutableOptions& ioptions, MemoryAllocator* allocator = nullptr); diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 8ecbb0f90b4f..b385ef55a2c0 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -10,6 +10,7 @@ #include "db/dbformat.h" #include "file/readahead_file_info.h" +#include "rocksdb/advanced_iterator.h" #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" #include "rocksdb/status.h" @@ -19,19 +20,6 @@ namespace ROCKSDB_NAMESPACE { class PinnedIteratorsManager; -enum class IterBoundCheck : char { - kUnknown = 0, - kOutOfBound, - kInbound, -}; - -struct IterateResult { - Slice key; - IterBoundCheck bound_check_result = IterBoundCheck::kUnknown; - // If false, PrepareValue() needs to be called before value(). - bool value_prepared = true; -}; - template class InternalIteratorBase : public Cleanable { public: @@ -212,6 +200,8 @@ class InternalIteratorBase : public Cleanable { // used by MergingIterator and LevelIterator for now. virtual bool IsDeleteRangeSentinelKey() const { return false; } + virtual void Prepare(const MultiScanArgs* /*scan_opts*/) {} + protected: void SeekForPrevImpl(const Slice& target, const CompareInterface* cmp) { Seek(target); diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index b53076910ec6..b585aaa4a7e0 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -195,6 +195,14 @@ class IteratorWrapperBase { return iter_->IsDeleteRangeSentinelKey(); } + // scan_opts lifetime is guaranteed until the iterator is destructed, or + // Prepare() is called with a new scan_opts + void Prepare(const MultiScanArgs* scan_opts) { + if (iter_) { + iter_->Prepare(scan_opts); + } + } + private: void Update() { valid_ = iter_->Valid(); diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index 375c811c59fc..e27f4c6fa270 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -482,6 +482,12 @@ class MergingIterator : public InternalIterator { current_->IsValuePinned(); } + void Prepare(const MultiScanArgs* scan_opts) override { + for (auto& child : children_) { + child.iter.Prepare(scan_opts); + } + } + private: // Represents an element in the min/max heap. Each HeapItem corresponds to a // point iterator or a range tombstone iterator, differentiated by diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 7d6ab76e294c..72ee79266af6 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -29,8 +29,6 @@ namespace ROCKSDB_NAMESPACE { const std::string kPropertiesBlockName = "rocksdb.properties"; // NB: only used with format_version >= 6 const std::string kIndexBlockName = "rocksdb.index"; -// Old property block name for backward compatibility -const std::string kPropertiesBlockOldName = "rocksdb.stats"; const std::string kCompressionDictBlockName = "rocksdb.compression_dict"; const std::string kRangeDelBlockName = "rocksdb.range_del"; @@ -167,6 +165,9 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { if (props.key_largest_seqno != UINT64_MAX) { Add(TablePropertiesNames::kKeyLargestSeqno, props.key_largest_seqno); } + if (props.key_smallest_seqno != UINT64_MAX) { + Add(TablePropertiesNames::kKeySmallestSeqno, props.key_smallest_seqno); + } } Slice PropertyBlockBuilder::Finish() { @@ -253,6 +254,146 @@ bool NotifyCollectTableCollectorsOnFinish( return all_succeeded; } +Status ParsePropertiesBlock( + const ImmutableOptions& ioptions, uint64_t offset, Block& properties_block, + std::unique_ptr& new_table_properties) { + std::unique_ptr iter(properties_block.NewMetaIterator()); + + // All pre-defined properties of type uint64_t + std::unordered_map predefined_uint64_properties = { + {TablePropertiesNames::kOriginalFileNumber, + &new_table_properties->orig_file_number}, + {TablePropertiesNames::kDataSize, &new_table_properties->data_size}, + {TablePropertiesNames::kIndexSize, &new_table_properties->index_size}, + {TablePropertiesNames::kIndexPartitions, + &new_table_properties->index_partitions}, + {TablePropertiesNames::kTopLevelIndexSize, + &new_table_properties->top_level_index_size}, + {TablePropertiesNames::kIndexKeyIsUserKey, + &new_table_properties->index_key_is_user_key}, + {TablePropertiesNames::kIndexValueIsDeltaEncoded, + &new_table_properties->index_value_is_delta_encoded}, + {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size}, + {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size}, + {TablePropertiesNames::kRawValueSize, + &new_table_properties->raw_value_size}, + {TablePropertiesNames::kNumDataBlocks, + &new_table_properties->num_data_blocks}, + {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries}, + {TablePropertiesNames::kNumFilterEntries, + &new_table_properties->num_filter_entries}, + {TablePropertiesNames::kDeletedKeys, + &new_table_properties->num_deletions}, + {TablePropertiesNames::kMergeOperands, + &new_table_properties->num_merge_operands}, + {TablePropertiesNames::kNumRangeDeletions, + &new_table_properties->num_range_deletions}, + {TablePropertiesNames::kFormatVersion, + &new_table_properties->format_version}, + {TablePropertiesNames::kFixedKeyLen, + &new_table_properties->fixed_key_len}, + {TablePropertiesNames::kColumnFamilyId, + &new_table_properties->column_family_id}, + {TablePropertiesNames::kCreationTime, + &new_table_properties->creation_time}, + {TablePropertiesNames::kOldestKeyTime, + &new_table_properties->oldest_key_time}, + {TablePropertiesNames::kNewestKeyTime, + &new_table_properties->newest_key_time}, + {TablePropertiesNames::kFileCreationTime, + &new_table_properties->file_creation_time}, + {TablePropertiesNames::kSlowCompressionEstimatedDataSize, + &new_table_properties->slow_compression_estimated_data_size}, + {TablePropertiesNames::kFastCompressionEstimatedDataSize, + &new_table_properties->fast_compression_estimated_data_size}, + {TablePropertiesNames::kTailStartOffset, + &new_table_properties->tail_start_offset}, + {TablePropertiesNames::kUserDefinedTimestampsPersisted, + &new_table_properties->user_defined_timestamps_persisted}, + {TablePropertiesNames::kKeyLargestSeqno, + &new_table_properties->key_largest_seqno}, + {TablePropertiesNames::kKeySmallestSeqno, + &new_table_properties->key_smallest_seqno}, + }; + + Status s; + std::string last_key; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + s = iter->status(); + if (!s.ok()) { + break; + } + + auto key = iter->key().ToString(); + // properties block should be strictly sorted with no duplicate key. + if (!last_key.empty() && + BytewiseComparator()->Compare(key, last_key) <= 0) { + s = Status::Corruption("properties unsorted"); + break; + } + last_key = key; + + auto raw_val = iter->value(); + auto pos = predefined_uint64_properties.find(key); + + if (key == ExternalSstFilePropertyNames::kGlobalSeqno) { + new_table_properties->external_sst_file_global_seqno_offset = + offset + iter->ValueOffset(); + } + + if (pos != predefined_uint64_properties.end()) { + if (key == TablePropertiesNames::kDeletedKeys || + key == TablePropertiesNames::kMergeOperands) { + // Insert in user-collected properties for API backwards compatibility + new_table_properties->user_collected_properties.insert( + {key, raw_val.ToString()}); + } + // handle predefined rocksdb properties + uint64_t val; + if (!GetVarint64(&raw_val, &val)) { + // skip malformed value + auto error_msg = + "Detect malformed value in properties meta-block:" + "\tkey: " + + key + "\tval: " + raw_val.ToString(); + ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str()); + continue; + } + *(pos->second) = val; + } else if (key == TablePropertiesNames::kDbId) { + new_table_properties->db_id = raw_val.ToString(); + } else if (key == TablePropertiesNames::kDbSessionId) { + new_table_properties->db_session_id = raw_val.ToString(); + } else if (key == TablePropertiesNames::kDbHostId) { + new_table_properties->db_host_id = raw_val.ToString(); + } else if (key == TablePropertiesNames::kFilterPolicy) { + new_table_properties->filter_policy_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kColumnFamilyName) { + new_table_properties->column_family_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kComparator) { + new_table_properties->comparator_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kMergeOperator) { + new_table_properties->merge_operator_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kPrefixExtractorName) { + new_table_properties->prefix_extractor_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kPropertyCollectors) { + new_table_properties->property_collectors_names = raw_val.ToString(); + } else if (key == TablePropertiesNames::kCompression) { + new_table_properties->compression_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kCompressionOptions) { + new_table_properties->compression_options = raw_val.ToString(); + } else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) { + new_table_properties->seqno_to_time_mapping = raw_val.ToString(); + } else { + // handle user-collected properties + new_table_properties->user_collected_properties.insert( + {key, raw_val.ToString()}); + } + } + + return s; +} + // FIXME: should be a parameter for reading table properties to use persistent // cache? Status ReadTablePropertiesHelper( @@ -282,7 +423,7 @@ Status ReadTablePropertiesHelper( BlockFetcher block_fetcher( file, prefetch_buffer, footer, modified_ro, handle, &block_contents, ioptions, false /* decompress */, false /*maybe_compressed*/, - BlockType::kProperties, UncompressionDict::GetEmptyDict(), + BlockType::kProperties, nullptr /*decompressor*/, PersistentCacheOptions::kEmpty, memory_allocator); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { @@ -296,15 +437,16 @@ Status ReadTablePropertiesHelper( // If retrying, use a stronger file system read to check and correct // data corruption IOOptions opts; - if (PrepareIOFromReadOptions(ro, ioptions.clock, opts) != + IODebugContext dbg; + if (PrepareIOFromReadOptions(ro, ioptions.clock, opts, &dbg) != IOStatus::OK()) { return s; } opts.verify_and_reconstruct_read = true; std::unique_ptr data(new char[len]); Slice result; - IOStatus io_s = - file->Read(opts, handle.offset(), len, &result, data.get(), nullptr); + IOStatus io_s = file->Read(opts, handle.offset(), len, &result, + data.get(), nullptr, &dbg); RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_COUNT); if (!io_s.ok()) { ROCKS_LOG_INFO(ioptions.info_log, @@ -324,146 +466,16 @@ Status ReadTablePropertiesHelper( uint64_t block_size = block_contents.data.size(); Block properties_block(std::move(block_contents)); - // Unfortunately, Block::size() might not equal block_contents.data.size(), - // and Block hides block_contents - std::unique_ptr iter(properties_block.NewMetaIterator()); - std::unique_ptr new_table_properties{new TableProperties}; - // All pre-defined properties of type uint64_t - std::unordered_map predefined_uint64_properties = { - {TablePropertiesNames::kOriginalFileNumber, - &new_table_properties->orig_file_number}, - {TablePropertiesNames::kDataSize, &new_table_properties->data_size}, - {TablePropertiesNames::kIndexSize, &new_table_properties->index_size}, - {TablePropertiesNames::kIndexPartitions, - &new_table_properties->index_partitions}, - {TablePropertiesNames::kTopLevelIndexSize, - &new_table_properties->top_level_index_size}, - {TablePropertiesNames::kIndexKeyIsUserKey, - &new_table_properties->index_key_is_user_key}, - {TablePropertiesNames::kIndexValueIsDeltaEncoded, - &new_table_properties->index_value_is_delta_encoded}, - {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size}, - {TablePropertiesNames::kRawKeySize, - &new_table_properties->raw_key_size}, - {TablePropertiesNames::kRawValueSize, - &new_table_properties->raw_value_size}, - {TablePropertiesNames::kNumDataBlocks, - &new_table_properties->num_data_blocks}, - {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries}, - {TablePropertiesNames::kNumFilterEntries, - &new_table_properties->num_filter_entries}, - {TablePropertiesNames::kDeletedKeys, - &new_table_properties->num_deletions}, - {TablePropertiesNames::kMergeOperands, - &new_table_properties->num_merge_operands}, - {TablePropertiesNames::kNumRangeDeletions, - &new_table_properties->num_range_deletions}, - {TablePropertiesNames::kFormatVersion, - &new_table_properties->format_version}, - {TablePropertiesNames::kFixedKeyLen, - &new_table_properties->fixed_key_len}, - {TablePropertiesNames::kColumnFamilyId, - &new_table_properties->column_family_id}, - {TablePropertiesNames::kCreationTime, - &new_table_properties->creation_time}, - {TablePropertiesNames::kOldestKeyTime, - &new_table_properties->oldest_key_time}, - {TablePropertiesNames::kNewestKeyTime, - &new_table_properties->newest_key_time}, - {TablePropertiesNames::kFileCreationTime, - &new_table_properties->file_creation_time}, - {TablePropertiesNames::kSlowCompressionEstimatedDataSize, - &new_table_properties->slow_compression_estimated_data_size}, - {TablePropertiesNames::kFastCompressionEstimatedDataSize, - &new_table_properties->fast_compression_estimated_data_size}, - {TablePropertiesNames::kTailStartOffset, - &new_table_properties->tail_start_offset}, - {TablePropertiesNames::kUserDefinedTimestampsPersisted, - &new_table_properties->user_defined_timestamps_persisted}, - {TablePropertiesNames::kKeyLargestSeqno, - &new_table_properties->key_largest_seqno}, - }; - - std::string last_key; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - s = iter->status(); - if (!s.ok()) { - break; - } - - auto key = iter->key().ToString(); - // properties block should be strictly sorted with no duplicate key. - if (!last_key.empty() && - BytewiseComparator()->Compare(key, last_key) <= 0) { - s = Status::Corruption("properties unsorted"); - break; - } - last_key = key; - - auto raw_val = iter->value(); - auto pos = predefined_uint64_properties.find(key); - - if (key == ExternalSstFilePropertyNames::kGlobalSeqno) { - new_table_properties->external_sst_file_global_seqno_offset = - handle.offset() + iter->ValueOffset(); - } - - if (pos != predefined_uint64_properties.end()) { - if (key == TablePropertiesNames::kDeletedKeys || - key == TablePropertiesNames::kMergeOperands) { - // Insert in user-collected properties for API backwards compatibility - new_table_properties->user_collected_properties.insert( - {key, raw_val.ToString()}); - } - // handle predefined rocksdb properties - uint64_t val; - if (!GetVarint64(&raw_val, &val)) { - // skip malformed value - auto error_msg = - "Detect malformed value in properties meta-block:" - "\tkey: " + - key + "\tval: " + raw_val.ToString(); - ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str()); - continue; - } - *(pos->second) = val; - } else if (key == TablePropertiesNames::kDbId) { - new_table_properties->db_id = raw_val.ToString(); - } else if (key == TablePropertiesNames::kDbSessionId) { - new_table_properties->db_session_id = raw_val.ToString(); - } else if (key == TablePropertiesNames::kDbHostId) { - new_table_properties->db_host_id = raw_val.ToString(); - } else if (key == TablePropertiesNames::kFilterPolicy) { - new_table_properties->filter_policy_name = raw_val.ToString(); - } else if (key == TablePropertiesNames::kColumnFamilyName) { - new_table_properties->column_family_name = raw_val.ToString(); - } else if (key == TablePropertiesNames::kComparator) { - new_table_properties->comparator_name = raw_val.ToString(); - } else if (key == TablePropertiesNames::kMergeOperator) { - new_table_properties->merge_operator_name = raw_val.ToString(); - } else if (key == TablePropertiesNames::kPrefixExtractorName) { - new_table_properties->prefix_extractor_name = raw_val.ToString(); - } else if (key == TablePropertiesNames::kPropertyCollectors) { - new_table_properties->property_collectors_names = raw_val.ToString(); - } else if (key == TablePropertiesNames::kCompression) { - new_table_properties->compression_name = raw_val.ToString(); - } else if (key == TablePropertiesNames::kCompressionOptions) { - new_table_properties->compression_options = raw_val.ToString(); - } else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) { - new_table_properties->seqno_to_time_mapping = raw_val.ToString(); - } else { - // handle user-collected properties - new_table_properties->user_collected_properties.insert( - {key, raw_val.ToString()}); - } - } + s = ParsePropertiesBlock(ioptions, handle.offset(), properties_block, + new_table_properties); // Modified version of BlockFetcher checksum verification // (See write_global_seqno comment above) if (s.ok() && footer.GetBlockTrailerSize() > 0) { s = VerifyBlockChecksum(footer, properties_block.data(), block_size, - file->file_name(), handle.offset()); + file->file_name(), handle.offset(), + BlockType::kProperties); if (s.IsCorruption()) { if (new_table_properties->external_sst_file_global_seqno_offset != 0) { std::string tmp_buf(properties_block.data(), len); @@ -472,7 +484,8 @@ Status ReadTablePropertiesHelper( handle.offset(); EncodeFixed64(&tmp_buf[static_cast(global_seqno_offset)], 0); s = VerifyBlockChecksum(footer, tmp_buf.data(), block_size, - file->file_name(), handle.offset()); + file->file_name(), handle.offset(), + BlockType::kProperties); } } } @@ -530,14 +543,6 @@ Status FindOptionalMetaBlock(InternalIterator* meta_index_iter, if (meta_index_iter->Valid() && meta_index_iter->key() == meta_block_name) { Slice v = meta_index_iter->value(); return block_handle->DecodeFrom(&v); - } else if (meta_block_name == kPropertiesBlockName) { - // Have to try old name for compatibility - meta_index_iter->Seek(kPropertiesBlockOldName); - if (meta_index_iter->status().ok() && meta_index_iter->Valid() && - meta_index_iter->key() == kPropertiesBlockOldName) { - Slice v = meta_index_iter->value(); - return block_handle->DecodeFrom(&v); - } } } // else @@ -567,8 +572,9 @@ Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, Footer* footer_out) { Footer footer; IOOptions opts; + IODebugContext dbg; Status s; - s = file->PrepareIOOptions(read_options, opts); + s = file->PrepareIOOptions(read_options, opts, &dbg); if (!s.ok()) { return s; } @@ -585,7 +591,7 @@ Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, return BlockFetcher(file, prefetch_buffer, footer, read_options, metaindex_handle, metaindex_contents, ioptions, false /* do decompression */, false /*maybe_compressed*/, - BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), + BlockType::kMetaIndex, nullptr /*decompressor*/, PersistentCacheOptions::kEmpty, memory_allocator) .ReadBlockContents(); } @@ -638,8 +644,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file, return BlockFetcher(file, prefetch_buffer, footer, read_options, block_handle, contents, ioptions, false /* decompress */, false /*maybe_compressed*/, block_type, - UncompressionDict::GetEmptyDict(), - PersistentCacheOptions::kEmpty, memory_allocator) + nullptr /*decompressor*/, PersistentCacheOptions::kEmpty, + memory_allocator) .ReadBlockContents(); } diff --git a/table/meta_blocks.h b/table/meta_blocks.h index a6aacdf5030a..0012e9c305fc 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -22,6 +22,7 @@ namespace ROCKSDB_NAMESPACE { +class Block; class BlockBuilder; class BlockHandle; class Env; @@ -33,7 +34,6 @@ struct TableProperties; // Meta block names for metaindex extern const std::string kPropertiesBlockName; extern const std::string kIndexBlockName; -extern const std::string kPropertiesBlockOldName; extern const std::string kCompressionDictBlockName; extern const std::string kRangeDelBlockName; @@ -110,6 +110,10 @@ bool NotifyCollectTableCollectorsOnFinish( UserCollectedProperties& user_collected_properties, UserCollectedProperties& readable_properties); +Status ParsePropertiesBlock( + const ImmutableOptions& ioptions, uint64_t offset, Block& block, + std::unique_ptr& new_table_properties); + // Read table properties from a file using known BlockHandle. // @returns a status to indicate if the operation succeeded. On success, // *table_properties will point to a heap-allocated TableProperties diff --git a/table/multiget_context.h b/table/multiget_context.h index a82c08aabe3c..c42b3b2c1869 100644 --- a/table/multiget_context.h +++ b/table/multiget_context.h @@ -129,7 +129,9 @@ class MultiGetContext { lookup_key_ptr_ = reinterpret_cast(lookup_key_heap_buf.get()); } - for (size_t iter = 0; iter != num_keys_; ++iter) { + for (size_t iter = 0; + iter < num_keys_ && /* suppress a warning */ iter < MAX_BATCH_SIZE; + ++iter) { // autovector may not be contiguous storage, so make a copy sorted_keys_[iter] = (*sorted_keys)[begin + iter]; sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[iter]) @@ -219,7 +221,9 @@ class MultiGetContext { while (++index_ < range_->end_ && (Mask{1} << index_) & (range_->ctx_->value_mask_ | range_->skip_mask_ | - range_->invalid_mask_)); + range_->invalid_mask_)) { + // empty loop body + } return *this; } diff --git a/table/plain/plain_table_builder.cc b/table/plain/plain_table_builder.cc index 541b4a5b768a..9c4f87553774 100644 --- a/table/plain/plain_table_builder.cc +++ b/table/plain/plain_table_builder.cc @@ -151,6 +151,14 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { return; } +#ifndef NDEBUG + bool skip = false; + TEST_SYNC_POINT_CALLBACK("PlainTableBuilder::Add::skip", (void*)&skip); + if (skip) { + return; + } +#endif // !NDEBUG + // Store key hash if (store_index_in_file_) { if (moptions_.prefix_extractor == nullptr) { diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index 578e92aa3126..b90f24da6898 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -120,7 +120,9 @@ Status PlainTableReader::Open( bool full_scan_mode, const bool immortal_table, const SliceTransform* prefix_extractor) { if (file_size > PlainTableIndex::kMaxFileSize) { - return Status::NotSupported("File is too large for PlainTableReader!"); + return Status::NotSupported("File size " + std::to_string(file_size) + + " exceeds PlainTableReader max file size " + + std::to_string(PlainTableIndex::kMaxFileSize)); } std::unique_ptr props; diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc index 905eef7004a7..a4b235546559 100644 --- a/table/sst_file_dumper.cc +++ b/table/sst_file_dumper.cc @@ -23,6 +23,7 @@ #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/file_checksum.h" #include "rocksdb/iterator.h" #include "rocksdb/slice_transform.h" #include "rocksdb/status.h" @@ -47,12 +48,13 @@ SstFileDumper::SstFileDumper(const Options& options, Temperature file_temp, size_t readahead_size, bool verify_checksum, bool output_hex, bool decode_blob_index, const EnvOptions& soptions, - bool silent) + bool silent, bool show_sequence_number_type) : file_name_(file_path), read_num_(0), file_temp_(file_temp), output_hex_(output_hex), decode_blob_index_(decode_blob_index), + show_sequence_number_type_(show_sequence_number_type), soptions_(soptions), silent_(silent), options_(options), @@ -84,6 +86,7 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) { uint64_t file_size = 0; FileOptions fopts = soptions_; fopts.temperature = file_temp_; + fopts.file_checksum_func_name = kNoFileChecksumFuncName; Status s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr); if (s.ok()) { // check empty file @@ -128,18 +131,18 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) { if (magic_number == kCuckooTableMagicNumber) { fopts = soptions_; fopts.temperature = file_temp_; + fopts.file_checksum_func_name = kNoFileChecksumFuncName; } fs->NewRandomAccessFile(file_path, fopts, &file, nullptr); file_.reset(new RandomAccessFileReader(std::move(file), file_path)); } - // For old sst format, ReadTableProperties might fail but file can be read - if (ReadTableProperties(magic_number, file_.get(), file_size, + s = ReadTableProperties(magic_number, file_.get(), file_size, (magic_number == kBlockBasedTableMagicNumber) ? &prefetch_buffer - : nullptr) - .ok()) { + : nullptr); + if (s.ok()) { s = SetTableOptionsByMagicNumber(magic_number); if (s.ok()) { if (table_properties_ && !table_properties_->comparator_name.empty()) { @@ -154,10 +157,16 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) { } } } - } else { - s = SetOldTableOptions(); } options_.comparator = internal_comparator_.user_comparator(); + + { + Status status = ReadMetaIndexBlockInFile( + file_.get(), file_size, magic_number, ImmutableOptions(options_), + ReadOptions(), &meta_index_contents_); + // Ignore any errors since this is required for a specific CLI option + status.PermitUncheckedError(); + } } if (s.ok()) { @@ -172,7 +181,8 @@ Status SstFileDumper::NewTableReader( const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size, std::unique_ptr* /*table_reader*/) { auto t_opt = TableReaderOptions( - ioptions_, moptions_.prefix_extractor, soptions_, internal_comparator_, + ioptions_, moptions_.prefix_extractor, + moptions_.compression_manager.get(), soptions_, internal_comparator_, 0 /* block_protection_bytes_per_key */, false /* skip_filters */, false /* immortal */, true /* force_direct_prefetch */, -1 /* level */, nullptr /* block_cache_tracer */, 0 /* max_file_size_for_l0_meta_pin */, @@ -211,7 +221,7 @@ Status SstFileDumper::DumpTable(const std::string& out_filename) { Env* env = options_.env; Status s = env->NewWritableFile(out_filename, &out_file, soptions_); if (s.ok()) { - s = table_reader_->DumpTable(out_file.get()); + s = table_reader_->DumpTable(out_file.get(), show_sequence_number_type_); } if (!s.ok()) { // close the file before return error, ignore the close error if there's any @@ -222,8 +232,9 @@ Status SstFileDumper::DumpTable(const std::string& out_filename) { } Status SstFileDumper::CalculateCompressedTableSize( - const TableBuilderOptions& tb_options, size_t block_size, - uint64_t* num_data_blocks, uint64_t* compressed_table_size) { + const TableBuilderOptions& tb_options, TableProperties* props, + std::chrono::microseconds* write_time, + std::chrono::microseconds* read_time) { std::unique_ptr env(NewMemEnv(options_.env)); std::unique_ptr dest_writer; Status s = @@ -232,12 +243,11 @@ Status SstFileDumper::CalculateCompressedTableSize( if (!s.ok()) { return s; } - BlockBasedTableOptions table_options; - table_options.block_size = block_size; - BlockBasedTableFactory block_based_tf(table_options); - std::unique_ptr table_builder; - table_builder.reset( - block_based_tf.NewTableBuilder(tb_options, dest_writer.get())); + std::chrono::steady_clock::time_point start = + std::chrono::steady_clock::now(); + std::unique_ptr table_builder{ + tb_options.moptions.table_factory->NewTableBuilder(tb_options, + dest_writer.get())}; std::unique_ptr iter(table_reader_->NewIterator( read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool)); @@ -248,56 +258,112 @@ Status SstFileDumper::CalculateCompressedTableSize( if (!s.ok()) { return s; } + iter.reset(); s = table_builder->Finish(); + *write_time = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + if (!s.ok()) { + return s; + } + s = dest_writer->Close({}); + if (!s.ok()) { + return s; + } + dest_writer.reset(); + *props = table_builder->GetTableProperties(); + start = std::chrono::steady_clock::now(); + TableReaderOptions reader_options(ioptions_, moptions_.prefix_extractor, + moptions_.compression_manager.get(), + soptions_, internal_comparator_, + 0 /* block_protection_bytes_per_key */); + std::unique_ptr file_reader; + s = RandomAccessFileReader::Create(env->GetFileSystem(), testFileName, + soptions_, &file_reader, /*dbg=*/nullptr); + if (!s.ok()) { + return s; + } + std::unique_ptr table_reader; + s = tb_options.moptions.table_factory->NewTableReader( + reader_options, std::move(file_reader), table_builder->FileSize(), + &table_reader); if (!s.ok()) { return s; } - *compressed_table_size = table_builder->FileSize(); - assert(num_data_blocks != nullptr); - *num_data_blocks = table_builder->GetTableProperties().num_data_blocks; + iter.reset(table_reader->NewIterator( + read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool)); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + } + s = iter->status(); + if (!s.ok()) { + return s; + } + iter.reset(); + table_reader.reset(); + file_reader.reset(); + *read_time = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); return env->DeleteFile(testFileName); } Status SstFileDumper::ShowAllCompressionSizes( - size_t block_size, - const std::vector>& - compression_types, - int32_t compress_level_from, int32_t compress_level_to, - uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes, - uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer) { - fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size); - for (auto& i : compression_types) { - if (CompressionTypeSupported(i.first)) { - fprintf(stdout, "Compression: %-24s\n", i.second); - CompressionOptions compress_opt; - compress_opt.max_dict_bytes = max_dict_bytes; - compress_opt.zstd_max_train_bytes = zstd_max_train_bytes; - compress_opt.max_dict_buffer_bytes = max_dict_buffer_bytes; - compress_opt.use_zstd_dict_trainer = use_zstd_dict_trainer; + const std::vector& compression_types, + int32_t compress_level_from, int32_t compress_level_to) { +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); +#endif + BlockBasedTableOptions bbto; + if (options_.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())) { + bbto = *(static_cast_with_check( + options_.table_factory.get())) + ->GetOptions(); + } + + for (CompressionType ctype : compression_types) { + std::string cname; + if (!GetStringFromCompressionType(&cname, ctype).ok()) { + // Can produce names like "Reserved4F" for unrecognized values + cname = CompressionTypeToString(ctype); + } + if (options_.compression_manager + ? options_.compression_manager->SupportsCompressionType(ctype) + : CompressionTypeSupported(ctype)) { + CompressionOptions compress_opt = options_.compression_opts; + fprintf(stdout, + "Compression: %-24s Block Size: %" PRIu64 " Threads: %u\n", + cname.c_str(), bbto.block_size, compress_opt.parallel_threads); for (int32_t j = compress_level_from; j <= compress_level_to; j++) { - fprintf(stdout, "Compression level: %d", j); + fprintf(stdout, "Cx level: %d", j); compress_opt.level = j; - Status s = ShowCompressionSize(block_size, i.first, compress_opt); + Status s = ShowCompressionSize(ctype, compress_opt); if (!s.ok()) { return s; } } } else { - fprintf(stdout, "Unsupported compression type: %s.\n", i.second); + fprintf(stdout, "Unsupported compression type: %s.\n", cname.c_str()); } } return Status::OK(); } Status SstFileDumper::ShowCompressionSize( - size_t block_size, CompressionType compress_type, - const CompressionOptions& compress_opt) { - Options opts; + CompressionType compress_type, const CompressionOptions& compress_opt) { + Options opts = options_; // Use compression_manager etc. opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); opts.statistics->set_stats_level(StatsLevel::kAll); + if (!opts.table_factory->IsInstanceOf(TableFactory::kBlockBasedTableName())) { + // Currently need block-based table for compression + opts.table_factory = std::make_shared(); + } + + // Create internal Options types const ImmutableOptions imoptions(opts); const ColumnFamilyOptions cfo(opts); const MutableCFOptions moptions(cfo); + // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; const WriteOptions write_options; @@ -312,24 +378,27 @@ Status SstFileDumper::ShowCompressionSize( &block_based_table_factories, compress_type, compress_opt, TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, column_family_name, unknown_level, kUnknownNewestKeyTime); - uint64_t num_data_blocks = 0; - std::chrono::steady_clock::time_point start = - std::chrono::steady_clock::now(); - uint64_t file_size; - Status s = CalculateCompressedTableSize(tb_opts, block_size, &num_data_blocks, - &file_size); + TableProperties props; + std::chrono::microseconds write_time; + std::chrono::microseconds read_time; + Status s = + CalculateCompressedTableSize(tb_opts, &props, &write_time, &read_time); if (!s.ok()) { return s; } - std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); - fprintf(stdout, " Size: %10" PRIu64, file_size); - fprintf(stdout, " Blocks: %6" PRIu64, num_data_blocks); - fprintf(stdout, " Time Taken: %10s microsecs", - std::to_string( - std::chrono::duration_cast(end - start) - .count()) + uint64_t num_data_blocks = props.num_data_blocks; + + fprintf(stdout, " Cx size: %10" PRIu64, props.data_size); + fprintf(stdout, " Uncx size: %10" PRIu64, props.uncompressed_data_size); + fprintf(stdout, " Ratio: %10s", + std::to_string(static_cast(props.uncompressed_data_size) / + static_cast(props.data_size)) .c_str()); + fprintf(stdout, " Write usec: %10s ", + std::to_string(write_time.count()).c_str()); + fprintf(stdout, " Read usec: %10s ", + std::to_string(read_time.count()).c_str()); const uint64_t compressed_blocks = opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_COMPRESSED); const uint64_t not_compressed_blocks = @@ -359,11 +428,11 @@ Status SstFileDumper::ShowCompressionSize( : ((static_cast(not_compressed_blocks) / static_cast(num_data_blocks)) * 100.0); - fprintf(stdout, " Compressed: %6" PRIu64 " (%5.1f%%)", compressed_blocks, + fprintf(stdout, " Cx count: %6" PRIu64 " (%5.1f%%)", compressed_blocks, compressed_pcnt); - fprintf(stdout, " Not compressed (ratio): %6" PRIu64 " (%5.1f%%)", + fprintf(stdout, " Not cx for ratio: %6" PRIu64 " (%5.1f%%)", ratio_not_compressed_blocks, ratio_not_compressed_pcnt); - fprintf(stdout, " Not compressed (abort): %6" PRIu64 " (%5.1f%%)\n", + fprintf(stdout, " Not cx otherwise: %6" PRIu64 " (%5.1f%%)\n", not_compressed_blocks, not_compressed_pcnt); return Status::OK(); } @@ -389,16 +458,22 @@ Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number, Status SstFileDumper::SetTableOptionsByMagicNumber( uint64_t table_magic_number) { assert(table_properties_); - if (table_magic_number == kBlockBasedTableMagicNumber || - table_magic_number == kLegacyBlockBasedTableMagicNumber) { - BlockBasedTableFactory* bbtf = new BlockBasedTableFactory(); + if (table_magic_number == kBlockBasedTableMagicNumber) { + // Preserve BlockBasedTableOptions on options_ when possible + if (!options_.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())) { + options_.table_factory = std::make_shared(); + } + + BlockBasedTableFactory* bbtf = + static_cast_with_check( + options_.table_factory.get()); // To force tail prefetching, we fake reporting two useful reads of 512KB // from the tail. // It needs at least two data points to warm up the stats. bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024); bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024); - options_.table_factory.reset(bbtf); if (!silent_) { fprintf(stdout, "Sst file format: block-based\n"); } @@ -448,16 +523,6 @@ Status SstFileDumper::SetTableOptionsByMagicNumber( return Status::OK(); } -Status SstFileDumper::SetOldTableOptions() { - assert(table_properties_ == nullptr); - options_.table_factory = std::make_shared(); - if (!silent_) { - fprintf(stdout, "Sst file format: block-based(old version)\n"); - } - - return Status::OK(); -} - Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num_limit, bool has_from, const std::string& from_key, bool has_to, const std::string& to_key, @@ -474,12 +539,11 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num_limit, const Comparator* ucmp = internal_comparator_.user_comparator(); size_t ts_sz = ucmp->timestamp_size(); - Slice from_slice = from_key; - Slice to_slice = to_key; + OptSlice from_opt = has_from ? from_key : OptSlice{}; + OptSlice to_opt = has_to ? to_key : OptSlice{}; std::string from_key_buf, to_key_buf; - auto [from, to] = MaybeAddTimestampsToRange( - has_from ? &from_slice : nullptr, has_to ? &to_slice : nullptr, ts_sz, - &from_key_buf, &to_key_buf); + auto [from, to] = MaybeAddTimestampsToRange(from_opt, to_opt, ts_sz, + &from_key_buf, &to_key_buf); uint64_t i = 0; if (from.has_value()) { InternalKey ikey; diff --git a/table/sst_file_dumper.h b/table/sst_file_dumper.h index a1a857115a8b..b7d9e4003b83 100644 --- a/table/sst_file_dumper.h +++ b/table/sst_file_dumper.h @@ -21,7 +21,8 @@ class SstFileDumper { bool verify_checksum, bool output_hex, bool decode_blob_index, const EnvOptions& soptions = EnvOptions(), - bool silent = false); + bool silent = false, + bool show_sequence_number_type = false); // read_num_limit limits the total number of keys read. If read_num_limit = 0, // then there is no limit. If read_num_limit = 0 or @@ -43,16 +44,14 @@ class SstFileDumper { Status getStatus() { return init_result_; } Status ShowAllCompressionSizes( - size_t block_size, - const std::vector>& - compression_types, - int32_t compress_level_from, int32_t compress_level_to, - uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes, - uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer); - - Status ShowCompressionSize(size_t block_size, CompressionType compress_type, + const std::vector& compression_types, + int32_t compress_level_from, int32_t compress_level_to); + + Status ShowCompressionSize(CompressionType compress_type, const CompressionOptions& compress_opt); + BlockContents& GetMetaIndexContents() { return meta_index_contents_; } + private: // Get the TableReader implementation for the sst file Status GetTableReader(const std::string& file_path); @@ -61,12 +60,11 @@ class SstFileDumper { FilePrefetchBuffer* prefetch_buffer); Status CalculateCompressedTableSize(const TableBuilderOptions& tb_options, - size_t block_size, - uint64_t* num_data_blocks, - uint64_t* compressed_table_size); + TableProperties* props, + std::chrono::microseconds* write_time, + std::chrono::microseconds* read_time); Status SetTableOptionsByMagicNumber(uint64_t table_magic_number); - Status SetOldTableOptions(); // Helper function to call the factory with settings specific to the // factory implementation @@ -81,6 +79,7 @@ class SstFileDumper { Temperature file_temp_; bool output_hex_; bool decode_blob_index_; + bool show_sequence_number_type_; EnvOptions soptions_; // less verbose in stdout/stderr bool silent_; @@ -98,6 +97,7 @@ class SstFileDumper { ReadOptions read_options_; InternalKeyComparator internal_comparator_; std::unique_ptr table_properties_; + BlockContents meta_index_contents_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc index a970666affa5..e63e67c92e1a 100644 --- a/table/sst_file_reader.cc +++ b/table/sst_file_reader.cc @@ -11,6 +11,7 @@ #include "file/random_access_file_reader.h" #include "options/cf_options.h" #include "rocksdb/env.h" +#include "rocksdb/file_checksum.h" #include "rocksdb/file_system.h" #include "table/get_context.h" #include "table/table_builder.h" @@ -51,6 +52,7 @@ Status SstFileReader::Open(const std::string& file_path) { std::unique_ptr file; std::unique_ptr file_reader; FileOptions fopts(r->soptions); + fopts.file_checksum_func_name = kNoFileChecksumFuncName; const auto& fs = r->options.env->GetFileSystem(); s = fs->GetFileSize(file_path, fopts.io_options, &file_size, nullptr); @@ -62,7 +64,8 @@ Status SstFileReader::Open(const std::string& file_path) { } if (s.ok()) { TableReaderOptions t_opt( - r->ioptions, r->moptions.prefix_extractor, r->soptions, + r->ioptions, r->moptions.prefix_extractor, + r->moptions.compression_manager.get(), r->soptions, r->ioptions.internal_comparator, r->moptions.block_protection_bytes_per_key, /*skip_filters*/ false, /*immortal*/ false, @@ -166,11 +169,11 @@ Iterator* SstFileReader::NewIterator(const ReadOptions& roptions) { ? roptions.snapshot->GetSequenceNumber() : kMaxSequenceNumber; ArenaWrappedDBIter* res = new ArenaWrappedDBIter(); - res->Init( - r->options.env, roptions, r->ioptions, r->moptions, nullptr /* version */, - sequence, r->moptions.max_sequential_skip_in_iterations, - 0 /* version_number */, nullptr /* read_callback */, nullptr /* cfh */, - true /* expose_blob_index */, false /* allow_refresh */); + res->Init(r->options.env, roptions, r->ioptions, r->moptions, + nullptr /* version */, sequence, 0 /* version_number */, + nullptr /* read_callback */, nullptr /* cfh */, + true /* expose_blob_index */, false /* allow_refresh */, + /*active_mem=*/nullptr); auto internal_iter = r->table_reader->NewIterator( res->GetReadOptions(), r->moptions.prefix_extractor.get(), res->GetArena(), false /* skip_filters */, diff --git a/table/sst_file_reader_test.cc b/table/sst_file_reader_test.cc index 2d169d6f3bee..439ac66b1963 100644 --- a/table/sst_file_reader_test.cc +++ b/table/sst_file_reader_test.cc @@ -164,7 +164,7 @@ TEST_F(SstFileReaderTest, ReadFileWithGlobalSeqno) { Options options; options.create_if_missing = true; std::string db_name = test::PerThreadDBPath("test_db"); - DB* db; + std::unique_ptr db; ASSERT_OK(DB::Open(options, db_name, &db)); // Bump sequence number. ASSERT_OK(db->Put(WriteOptions(), keys[0], "foo")); @@ -186,7 +186,7 @@ TEST_F(SstFileReaderTest, ReadFileWithGlobalSeqno) { } } ASSERT_FALSE(ingested_file.empty()); - delete db; + db.reset(); // Verify the file can be open and read by SstFileReader. CheckFile(db_name + ingested_file, keys, true /* check_global_seqno */); diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index 8d1b03380d40..cf6c32cdf7da 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -30,7 +30,7 @@ const size_t kFadviseTrigger = 1024 * 1024; // 1MB struct SstFileWriter::Rep { Rep(const EnvOptions& _env_options, const Options& options, Env::IOPriority _io_priority, const Comparator* _user_comparator, - ColumnFamilyHandle* _cfh, bool _invalidate_page_cache, bool _skip_filters, + ColumnFamilyHandle* _cfh, bool _invalidate_page_cache, std::string _db_session_id) : env_options(_env_options), ioptions(options), @@ -39,7 +39,6 @@ struct SstFileWriter::Rep { internal_comparator(_user_comparator), cfh(_cfh), invalidate_page_cache(_invalidate_page_cache), - skip_filters(_skip_filters), db_session_id(_db_session_id), ts_sz(_user_comparator->timestamp_size()), strip_timestamp(ts_sz > 0 && @@ -67,7 +66,6 @@ struct SstFileWriter::Rep { // The size of the file during the last time we called Fadvise to remove // cached pages from page cache. uint64_t last_fadvise_size = 0; - bool skip_filters; std::string db_session_id; uint64_t next_file_number = 1; size_t ts_sz; @@ -305,9 +303,9 @@ SstFileWriter::SstFileWriter(const EnvOptions& env_options, const Comparator* user_comparator, ColumnFamilyHandle* column_family, bool invalidate_page_cache, - Env::IOPriority io_priority, bool skip_filters) + Env::IOPriority io_priority) : rep_(new Rep(env_options, options, io_priority, user_comparator, - column_family, invalidate_page_cache, skip_filters, + column_family, invalidate_page_cache, DBImpl::GenerateDbSessionId(options.env))) { // SstFileWriter is used to create sst files that can be added to database // later. Therefore, no real db_id and db_session_id are associated with it. @@ -403,9 +401,6 @@ Status SstFileWriter::Open(const std::string& file_path, Temperature temp) { // assign fake file numbers to each file (into table properties) and keep // the same session id for the life of the SstFileWriter. r->next_file_number++; - // XXX: when we can remove skip_filters from the SstFileWriter public API - // we can remove it from TableBuilderOptions. - table_builder_options.skip_filters = r->skip_filters; FileTypeSet tmp_set = r->ioptions.checksum_handoff_file_types; r->file_writer.reset(new WritableFileWriter( std::move(sst_file), file_path, r->env_options, r->ioptions.clock, @@ -424,10 +419,6 @@ Status SstFileWriter::Open(const std::string& file_path, Temperature temp) { return s; } -Status SstFileWriter::Add(const Slice& user_key, const Slice& value) { - return rep_->Add(user_key, value, ValueType::kTypeValue); -} - Status SstFileWriter::Put(const Slice& user_key, const Slice& value) { return rep_->Add(user_key, value, ValueType::kTypeValue); } @@ -472,6 +463,7 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { } if (r->file_info.num_entries == 0 && r->file_info.num_range_del_entries == 0) { + r->builder->status().PermitUncheckedError(); return Status::InvalidArgument("Cannot create sst file with no entries"); } @@ -495,7 +487,10 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { r->file_writer->GetFileChecksumFuncName(); } if (!s.ok()) { - r->ioptions.env->DeleteFile(r->file_info.file_path); + Status status = r->ioptions.env->DeleteFile(r->file_info.file_path); + // Silence ASSERT_STATUS_CHECKED warning, since DeleteFile may fail under + // some error injection, and we can just ignore the failure + status.PermitUncheckedError(); } if (file_info != nullptr) { diff --git a/table/table_builder.h b/table/table_builder.h index 5ed7aba51f3d..ec9f61bbf98b 100644 --- a/table/table_builder.h +++ b/table/table_builder.h @@ -24,6 +24,7 @@ #include "rocksdb/table_properties.h" #include "table/unique_id_impl.h" #include "trace_replay/block_cache_tracer.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { @@ -35,6 +36,7 @@ struct TableReaderOptions { TableReaderOptions( const ImmutableOptions& _ioptions, const std::shared_ptr& _prefix_extractor, + UnownedPtr _compression_manager, const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, uint8_t _block_protection_bytes_per_key, bool _skip_filters = false, @@ -46,6 +48,7 @@ struct TableReaderOptions { uint64_t _tail_size = 0, bool _user_defined_timestamps_persisted = true) : ioptions(_ioptions), prefix_extractor(_prefix_extractor), + compression_manager(_compression_manager), env_options(_env_options), internal_comparator(_internal_comparator), skip_filters(_skip_filters), @@ -64,6 +67,9 @@ struct TableReaderOptions { const ImmutableOptions& ioptions; const std::shared_ptr& prefix_extractor; + // NOTE: the compression manager is not saved, just potentially a decompressor + // from it, so we don't need a shared_ptr copy + UnownedPtr compression_manager; const EnvOptions& env_options; const InternalKeyComparator& internal_comparator; // This is only used for BlockBasedTable (reader) @@ -158,10 +164,6 @@ struct TableBuilderOptions : public TablePropertiesCollectorFactory::Context { const TableFileCreationReason reason; // END for FilterBuildingContext - // XXX: only used by BlockBasedTableBuilder for SstFileWriter. If you - // want to skip filters, that should be (for example) null filter_policy - // in the table options of the ioptions.table_factory - bool skip_filters = false; const uint64_t cur_file_num; }; @@ -207,6 +209,9 @@ class TableBuilder { return NumEntries() == 0 && GetTableProperties().num_range_deletions == 0; } + // Size of the file before its content is compressed. + virtual uint64_t PreCompressionSize() const { return 0; } + // Size of the file generated so far. If invoked after a successful // Finish() call, returns the size of the final generated file. virtual uint64_t FileSize() const = 0; @@ -216,6 +221,11 @@ class TableBuilder { // is enabled. virtual uint64_t EstimatedFileSize() const { return FileSize(); } + // Estimated tail size of the SST file generated so far. The "tail" refers to + // all blocks written after data blocks (index + filter). This value helps + // estimate the total file size when deciding when to cut files. + virtual uint64_t EstimatedTailSize() const { return 0; } + virtual uint64_t GetTailSize() const { return 0; } // If the user defined table properties collector suggest the file to @@ -236,6 +246,11 @@ class TableBuilder { virtual void SetSeqnoTimeTableProperties( const SeqnoToTimeMapping& /*relevant_mapping*/, uint64_t /*oldest_ancestor_time*/) {} + + // If this builder used CPU work from threads other than the caller, return + // the CPU microseconds used. 0 = no work outside calling thread, or not + // supported. + virtual uint64_t GetWorkerCPUMicros() const { return 0; } }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/table_properties.cc b/table/table_properties.cc index 7fee67d1e928..48886c873fb7 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -65,6 +65,8 @@ std::string TableProperties::ToString(const std::string& prop_delim, prop_delim, kv_delim); AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); + AppendProperty(result, "data uncompressed size", uncompressed_data_size, + prop_delim, kv_delim); char index_block_size_str[80]; snprintf(index_block_size_str, sizeof(index_block_size_str), "index block size (user-key? %d, delta-value? %d)", @@ -116,6 +118,8 @@ std::string TableProperties::ToString(const std::string& prop_delim, prop_delim, kv_delim); AppendProperty(result, "largest sequence number in file", key_largest_seqno, prop_delim, kv_delim); + AppendProperty(result, "smallest sequence number in file", key_smallest_seqno, + prop_delim, kv_delim); AppendProperty( result, "merge operator name", @@ -178,6 +182,7 @@ std::string TableProperties::ToString(const std::string& prop_delim, void TableProperties::Add(const TableProperties& tp) { data_size += tp.data_size; + uncompressed_data_size += tp.uncompressed_data_size; index_size += tp.index_size; index_partitions += tp.index_partitions; top_level_index_size += tp.top_level_index_size; @@ -202,6 +207,7 @@ std::map TableProperties::GetAggregatablePropertiesAsMap() const { std::map rv; rv["data_size"] = data_size; + rv["uncompressed_data_size"] = uncompressed_data_size; rv["index_size"] = index_size; rv["index_partitions"] = index_partitions; rv["top_level_index_size"] = top_level_index_size; @@ -320,6 +326,8 @@ const std::string TablePropertiesNames::kUserDefinedTimestampsPersisted = "rocksdb.user.defined.timestamps.persisted"; const std::string TablePropertiesNames::kKeyLargestSeqno = "rocksdb.key.largest.seqno"; +const std::string TablePropertiesNames::kKeySmallestSeqno = + "rocksdb.key.smallest.seqno"; static std::unordered_map table_properties_type_info = { @@ -330,6 +338,10 @@ static std::unordered_map {"data_size", {offsetof(struct TableProperties, data_size), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"uncompressed_data_size", + {offsetof(struct TableProperties, uncompressed_data_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"index_size", {offsetof(struct TableProperties, index_size), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, @@ -434,6 +446,10 @@ static std::unordered_map {offsetof(struct TableProperties, key_largest_seqno), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"key_smallest_seqno", + {offsetof(struct TableProperties, key_smallest_seqno), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"db_id", {offsetof(struct TableProperties, db_id), OptionType::kEncodedString}}, {"db_session_id", diff --git a/table/table_reader.h b/table/table_reader.h index a9d46499bd06..4363755210fa 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -179,13 +179,15 @@ class TableReader { } // convert db file to a human readable form - virtual Status DumpTable(WritableFile* /*out_file*/) { + virtual Status DumpTable(WritableFile* /*out_file*/, + bool /*show_sequence_number_type*/ = false) { return Status::NotSupported("DumpTable() not supported"); } // check whether there is corruption in this db file virtual Status VerifyChecksum(const ReadOptions& /*read_options*/, - TableReaderCaller /*caller*/) { + TableReaderCaller /*caller*/, + bool /*meta_blocks_only*/ = false) { return Status::NotSupported("VerifyChecksum() not supported"); } diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index a588f6eea07c..ce2e81ddecef 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -84,7 +84,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, Env* env = Env::Default(); auto* clock = env->GetSystemClock().get(); TableBuilder* tb = nullptr; - DB* db = nullptr; + std::unique_ptr db; Status s; const ImmutableOptions ioptions(opts); const ColumnFamilyOptions cfo(opts); @@ -145,8 +145,9 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::unique_ptr file_reader( new RandomAccessFileReader(std::move(raf), file_name)); s = opts.table_factory->NewTableReader( - TableReaderOptions(ioptions, moptions.prefix_extractor, env_options, - ikc, 0 /* block_protection_bytes_per_key */), + TableReaderOptions(ioptions, moptions.prefix_extractor, + moptions.compression_manager.get(), env_options, ikc, + 0 /* block_protection_bytes_per_key */), std::move(file_reader), file_size, &table_reader); if (!s.ok()) { fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str()); @@ -256,8 +257,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, if (!through_db) { env->DeleteFile(file_name); } else { - delete db; - db = nullptr; + db.reset(); DestroyDB(dbname, opts); } } diff --git a/table/table_test.cc b/table/table_test.cc index 7441b0ff706b..e49b3ecf5b35 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include "db/write_batch_internal.h" #include "memtable/stl_wrappers.h" #include "monitoring/statistics_impl.h" +#include "options/cf_options.h" #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" @@ -36,7 +38,7 @@ #include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "rocksdb/external_table_reader.h" +#include "rocksdb/external_table.h" #include "rocksdb/file_checksum.h" #include "rocksdb/file_system.h" #include "rocksdb/filter_policy.h" @@ -50,6 +52,8 @@ #include "rocksdb/table_properties.h" #include "rocksdb/trace_record.h" #include "rocksdb/unique_id.h" +#include "rocksdb/user_defined_index.h" +#include "rocksdb/utilities/object_registry.h" #include "rocksdb/write_buffer_manager.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_builder.h" @@ -70,8 +74,9 @@ #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" -#include "util/coding_lean.h" +#include "util/coding.h" #include "util/compression.h" +#include "util/defer.h" #include "util/file_checksum_helper.h" #include "util/random.h" #include "util/string_util.h" @@ -83,6 +88,7 @@ namespace ROCKSDB_NAMESPACE { namespace { const std::string kDummyValue(10000, 'o'); +constexpr auto kVerbose = false; // DummyPropertiesCollector used to test BlockBasedTableProperties class DummyPropertiesCollector : public TablePropertiesCollector { @@ -443,7 +449,8 @@ class TableConstructor : public Constructor { file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); return moptions.table_factory->NewTableReader( - TableReaderOptions(ioptions, moptions.prefix_extractor, soptions, + TableReaderOptions(ioptions, moptions.prefix_extractor, + moptions.compression_manager.get(), soptions, *last_internal_comparator_, 0 /* block_protection_bytes_per_key */, /*skip_filters*/ false, @@ -576,18 +583,16 @@ class DBConstructor : public Constructor { public: explicit DBConstructor(const Comparator* cmp) : Constructor(cmp), comparator_(cmp) { - db_ = nullptr; NewDB(); } - ~DBConstructor() override { delete db_; } + ~DBConstructor() override {} Status FinishImpl(const Options& /*options*/, const ImmutableOptions& /*ioptions*/, const MutableCFOptions& /*moptions*/, const BlockBasedTableOptions& /*table_options*/, const InternalKeyComparator& /*internal_comparator*/, const stl_wrappers::KVMap& kv_map) override { - delete db_; - db_ = nullptr; + db_.reset(); NewDB(); for (const auto& kv : kv_map) { WriteBatch batch; @@ -602,7 +607,7 @@ class DBConstructor : public Constructor { return new InternalIteratorFromIterator(db_->NewIterator(ReadOptions())); } - DB* db() const override { return db_; } + DB* db() const override { return db_.get(); } private: void NewDB() { @@ -621,7 +626,7 @@ class DBConstructor : public Constructor { } const Comparator* comparator_; - DB* db_; + std::unique_ptr db_; }; enum TestType { @@ -668,35 +673,6 @@ static std::vector GenerateArgList() { std::vector restart_intervals = {16, 1, 1024}; std::vector compression_parallel_threads = {1, 4}; - // Only add compression if it is supported - std::vector> compression_types; - compression_types.emplace_back(kNoCompression, false); - if (Snappy_Supported()) { - compression_types.emplace_back(kSnappyCompression, false); - } - if (Zlib_Supported()) { - compression_types.emplace_back(kZlibCompression, false); - compression_types.emplace_back(kZlibCompression, true); - } - if (BZip2_Supported()) { - compression_types.emplace_back(kBZip2Compression, false); - compression_types.emplace_back(kBZip2Compression, true); - } - if (LZ4_Supported()) { - compression_types.emplace_back(kLZ4Compression, false); - compression_types.emplace_back(kLZ4Compression, true); - compression_types.emplace_back(kLZ4HCCompression, false); - compression_types.emplace_back(kLZ4HCCompression, true); - } - if (XPRESS_Supported()) { - compression_types.emplace_back(kXpressCompression, false); - compression_types.emplace_back(kXpressCompression, true); - } - if (ZSTD_Supported()) { - compression_types.emplace_back(kZSTD, false); - compression_types.emplace_back(kZSTD, true); - } - for (auto test_type : test_types) { for (auto reverse_compare : reverse_compare_types) { if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX || @@ -707,9 +683,9 @@ static std::vector GenerateArgList() { one_arg.type = test_type; one_arg.reverse_compare = reverse_compare; one_arg.restart_interval = restart_intervals[0]; - one_arg.compression = compression_types[0].first; + one_arg.compression = kNoCompression; one_arg.compression_parallel_threads = 1; - one_arg.format_version = 0; + one_arg.format_version = 0; // Plain tables use their own versioning one_arg.use_mmap = true; test_args.push_back(one_arg); one_arg.use_mmap = false; @@ -718,17 +694,20 @@ static std::vector GenerateArgList() { } for (auto restart_interval : restart_intervals) { - for (auto compression_type : compression_types) { + for (auto compression_type : GetSupportedCompressions()) { for (auto num_threads : compression_parallel_threads) { - TestArgs one_arg; - one_arg.type = test_type; - one_arg.reverse_compare = reverse_compare; - one_arg.restart_interval = restart_interval; - one_arg.compression = compression_type.first; - one_arg.compression_parallel_threads = num_threads; - one_arg.format_version = compression_type.second ? 2 : 1; - one_arg.use_mmap = false; - test_args.push_back(one_arg); + // format_version = 7 changes some compression handling + for (uint32_t fv : {kMinSupportedBbtFormatVersionForRead, 7U}) { + TestArgs one_arg; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_interval; + one_arg.compression = compression_type; + one_arg.compression_parallel_threads = num_threads; + one_arg.format_version = fv; + one_arg.use_mmap = false; + test_args.push_back(one_arg); + } } } } @@ -761,9 +740,6 @@ class FixedOrLessPrefixTransform : public SliceTransform { bool InDomain(const Slice& /*src*/) const override { return true; } - bool InRange(const Slice& dst) const override { - return (dst.size() <= prefix_len_); - } bool FullLengthEnabled(size_t* /*len*/) const override { return false; } }; @@ -929,7 +905,6 @@ class HarnessTest : public testing::Test { void TestRandomAccess(Random* rnd, const std::vector& keys, const stl_wrappers::KVMap& data) { - static const bool kVerbose = false; InternalIterator* iter = constructor_->NewIterator(); ASSERT_TRUE(!iter->Valid()); stl_wrappers::KVMap::const_iterator model_iter = data.begin(); @@ -1135,15 +1110,20 @@ class TableTest : public testing::Test { class GeneralTableTest : public TableTest {}; class BlockBasedTableTestBase : public TableTest {}; -class BlockBasedTableTest - : public BlockBasedTableTestBase, - virtual public ::testing::WithParamInterface { +class BlockBasedTableTest : public BlockBasedTableTestBase, + virtual public ::testing::WithParamInterface< + std::tuple> { public: - BlockBasedTableTest() : format_(GetParam()) { env_ = Env::Default(); } + BlockBasedTableTest() : format_(std::get<0>(GetParam())) { + env_ = Env::Default(); + } BlockBasedTableOptions GetBlockBasedTableOptions() { BlockBasedTableOptions options; options.format_version = format_; + auto param = GetParam(); + options.super_block_alignment_size = std::get<1>(param); + options.super_block_alignment_space_overhead_ratio = std::get<2>(param); return options; } @@ -1375,8 +1355,12 @@ class FileChecksumTestHelper { uint64_t FileChecksumTestHelper::checksum_file_num_ = 1; -INSTANTIATE_TEST_CASE_P(FormatVersions, BlockBasedTableTest, - testing::ValuesIn(test::kFooterFormatVersionsToTest)); +INSTANTIATE_TEST_CASE_P( + FormatVersions, BlockBasedTableTest, + testing::Combine(testing::ValuesIn(test::kFooterFormatVersionsToTest), + testing::Values(0, 128 * 1024, 512 * 1024, + 2 * 1024 * 1024), + testing::Values(2048, 32, 128))); // This test serves as the living tutorial for the prefix scan of user collected // properties. @@ -1793,18 +1777,23 @@ TEST_P(BlockBasedTableTest, IndexUncompressed) { #endif // SNAPPY TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) { - TableConstructor c(&reverse_key_comparator); + TableConstructor c(&reverse_key_comparator, + true /* convert_to_internal_key_ */); std::vector keys; stl_wrappers::KVMap kvmap; - { + for (CompressionType ct : {kNoCompression, kSnappyCompression}) { + if (!Snappy_Supported() && ct == kSnappyCompression) { + continue; + } Options options; - options.compression = CompressionType::kNoCompression; + options.compression = ct; BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); + c.Add("blah", std::string(200, 'x')); // something to compress c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1821,7 +1810,13 @@ TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) { // No filter policy is used ASSERT_EQ("", props.filter_policy_name); // Compression type == that set: - ASSERT_EQ("NoCompression", props.compression_name); + if (FormatVersionUsesCompressionManagerName(table_options.format_version)) { + ASSERT_EQ(ct == kNoCompression ? ";;" : "BuiltinV2;01;", + props.compression_name); + } else { + ASSERT_EQ(ct == kNoCompression ? "NoCompression" : "Snappy", + props.compression_name); + } c.ResetTableReader(); } @@ -2044,7 +2039,7 @@ TEST_P(BlockBasedTableTest, PrefetchTest) { // Simple PrefetchRange(&c, &opt, &table_options, - /*key_range=*/"k01", "k05", + /*key_begin=*/"k01", /*key_end=*/"k05", /*keys_in_cache=*/{"k01", "k02", "k03", "k04", "k05"}, /*keys_not_in_cache=*/{"k06", "k07"}); PrefetchRange(&c, &opt, &table_options, "k01", "k01", {"k01", "k02", "k03"}, @@ -2280,6 +2275,44 @@ TEST_P(BlockBasedTableTest, BadChecksumType) { "Corruption: Corrupt or unsupported checksum type: 123 in test"); } +TEST_P(BlockBasedTableTest, ReservedBitInDataBlockFooter) { + // Test that reserved metadata bits in data block footer are detected. + // We construct a block directly rather than going through the full table + // iterator path to avoid issues with iterator error handling. + + // Build a simple data block + BlockBuilder builder(16 /* restart_interval */); + InternalKey key("abc", 1, kTypeValue); + builder.Add(key.Encode(), "test_value"); + Slice block_contents = builder.Finish(); + std::string block_data = block_contents.ToString(); + + // The footer is the last 4 bytes - corrupt it by setting reserved bit 28 + ASSERT_GE(block_data.size(), sizeof(uint32_t)); + size_t footer_offset = block_data.size() - sizeof(uint32_t); + uint32_t footer = DecodeFixed32(block_data.data() + footer_offset); + footer |= (1u << 28); // Set lowest reserved bit + EncodeFixed32(&block_data[footer_offset], footer); + + // Try to construct a Block from the corrupted data + BlockContents contents(std::move(block_data)); + Block block(std::move(contents), 0 /* read_amp_bytes_per_bit */); + + // Block should have size() == 0 indicating error + ASSERT_EQ(block.size(), 0u); + + // Try to get an iterator - it should be invalid with corruption status + DataBlockIter iter; + block.NewDataIterator(BytewiseComparator(), kMaxSequenceNumber, &iter, + /*stats=*/nullptr, /*block_contents_pinned=*/false); + ASSERT_FALSE(iter.Valid()); + ASSERT_EQ(iter.status().code(), Status::kCorruption) + << iter.status().ToString(); + ASSERT_NE(iter.status().ToString().find("reserved bits set"), + std::string::npos) + << iter.status().ToString(); +} + class BuiltinChecksumTest : public testing::Test, public testing::WithParamInterface {}; @@ -2651,9 +2684,18 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) { c.ResetTableReader(); } -TEST_P(BlockBasedTableTest, BinaryIndexTest) { +TEST_P(BlockBasedTableTest, BinaryIndexTestBinarySearch) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kBinarySearch; + table_options.index_block_search_type = BlockBasedTableOptions::kBinary; + IndexTest(table_options); +} + +TEST_P(BlockBasedTableTest, BinaryIndexTestInterpolationSearch) { BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.index_type = BlockBasedTableOptions::kBinarySearch; + table_options.index_block_search_type = + BlockBasedTableOptions::kInterpolation; IndexTest(table_options); } @@ -4701,8 +4743,8 @@ TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) { // an arbitrary slice between k04 and k05, either before or after k04a ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 10000, 211000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 512000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 512000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000)); c.ResetTableReader(); } @@ -4728,13 +4770,18 @@ static void DoCompressionTest(CompressionType comp) { const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap); + size_t file_size = c.TEST_GetSink()->contents().size(); + EXPECT_EQ(c.ApproximateOffsetOf("abc"), 0); + EXPECT_EQ(c.ApproximateOffsetOf("k01"), 0); + EXPECT_EQ(c.ApproximateOffsetOf("k02"), 0); + EXPECT_NEAR2(c.ApproximateOffsetOf("k03"), file_size / 2, file_size / 10); + EXPECT_NEAR2(c.ApproximateOffsetOf("k04"), file_size / 2, file_size / 10); + EXPECT_NEAR2(c.ApproximateOffsetOf("xyz"), file_size, file_size / 10); + + size_t data_blocks_size = c.GetTableReader()->GetTableProperties()->data_size; + // Near expected compressed size ~= (0.25 + 0.25) * 10000 + EXPECT_NEAR2(data_blocks_size, 5000, 1500); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3555)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3555)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7110)); c.ResetTableReader(); } @@ -4972,30 +5019,11 @@ TEST(TableTest, FooterTests) { BlockHandle meta_index(data_size + index_size + 2 * 5, metaindex_size); uint64_t footer_offset = data_size + metaindex_size + index_size + 3 * 5; uint32_t base_context_checksum = 123456789; - { - // legacy block based - FooterBuilder footer; - ASSERT_OK(footer.Build(kBlockBasedTableMagicNumber, /* format_version */ 0, - footer_offset, kCRC32c, meta_index, index)); - Footer decoded_footer; - ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); - ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); - ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c); - ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); - ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); - ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); - ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); - ASSERT_EQ(decoded_footer.format_version(), 0U); - ASSERT_EQ(decoded_footer.base_context_checksum(), 0U); - ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U); - // Ensure serialized with legacy magic - ASSERT_EQ( - DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8), - kLegacyBlockBasedTableMagicNumber); - } - // block based, various checksums, various versions + // block based, various checksums, various versions (format_version >= 2) for (auto t : GetSupportedChecksums()) { - for (uint32_t fv = 1; IsSupportedFormatVersion(fv); ++fv) { + for (uint32_t fv = kMinSupportedBbtFormatVersionForWrite; + IsSupportedFormatVersionForWrite(kBlockBasedTableMagicNumber, fv); + ++fv) { uint32_t maybe_bcc = FormatVersionUsesContextChecksum(fv) ? base_context_checksum : 0U; FooterBuilder footer; @@ -5042,41 +5070,154 @@ TEST(TableTest, FooterTests) { } } + // plain table, various checksums, various versions (format_version >= 2) + // Plain tables have no block trailer (size 0), so set up separate handles + // Note: format_version >= 6 has complex footer checksum requirements, + // so we only test format_version 2-5 for plain tables here { - // legacy plain table - FooterBuilder footer; - ASSERT_OK(footer.Build(kPlainTableMagicNumber, /* format_version */ 0, - footer_offset, kNoChecksum, meta_index)); + uint64_t plain_metaindex_size = r->Uniform(1000000); + // For plain tables: metaindex is at offset 0, footer immediately follows + BlockHandle plain_meta_index(0, plain_metaindex_size); + uint64_t plain_footer_offset = plain_metaindex_size; + for (auto t : GetSupportedChecksums()) { + for (uint32_t fv = kMinSupportedBbtFormatVersionForWrite; fv < 6; ++fv) { + FooterBuilder footer; + ASSERT_OK(footer.Build(kPlainTableMagicNumber, fv, plain_footer_offset, + t, plain_meta_index)); + Footer decoded_footer; + ASSERT_OK( + decoded_footer.DecodeFrom(footer.GetSlice(), plain_footer_offset)); + ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum_type(), t); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), + plain_meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), + plain_meta_index.size()); + ASSERT_EQ(decoded_footer.format_version(), fv); + ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U); + } + } + } +} + +// Test that legacy SST formats (format_version < 2) are properly rejected +TEST(TableTest, LegacyFormatRejectionTests) { + // Temporarily disable unsupported format version allowance for this test + bool& allow = TEST_AllowUnsupportedFormatVersion(); + SaveAndRestore saved_allow(&allow, false); + + // Test legacy block-based magic number from LevelDB should be rejected + { + // Construct a fake footer with legacy block-based magic number + std::array fake_footer; + std::fill(fake_footer.begin(), fake_footer.end(), 0); + // Put legacy magic number at the end + EncodeFixed64(fake_footer.data() + fake_footer.size() - 8, + 0xdb4775248b80fb57ull /*legacy magic number*/); + Footer decoded_footer; - ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); - ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber); - ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c); - ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); - ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); - ASSERT_EQ(decoded_footer.index_handle().offset(), 0U); - ASSERT_EQ(decoded_footer.index_handle().size(), 0U); - ASSERT_EQ(decoded_footer.format_version(), 0U); - ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U); - // Ensure serialized with legacy magic - ASSERT_EQ( - DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8), - kLegacyPlainTableMagicNumber); + Status s = decoded_footer.DecodeFrom( + Slice(fake_footer.data(), fake_footer.size()), 0); + ASSERT_TRUE(s.IsNotSupported()) << s.ToString(); + ASSERT_TRUE(s.ToString().find("nsupported legacy magic number") != + std::string::npos) + << s.ToString(); + ASSERT_TRUE(s.ToString().find("full compaction") != std::string::npos) + << s.ToString(); + } + + // Test format_version=1 with new magic number should be rejected + { + std::array fake_footer; + std::fill(fake_footer.begin(), fake_footer.end(), 0); + // Part 1: checksum type + fake_footer[0] = kCRC32c; + // Part 3: format_version=1 and new magic number + char* part3 = fake_footer.data() + fake_footer.size() - 12; + EncodeFixed32(part3, 1); // format_version = 1 + EncodeFixed64(part3 + 4, kBlockBasedTableMagicNumber); + + Footer decoded_footer; + Status s = decoded_footer.DecodeFrom( + Slice(fake_footer.data(), fake_footer.size()), 0); + // format_version=1 is not supported for read, should return Corruption + ASSERT_TRUE(s.IsCorruption()) << s.ToString(); + ASSERT_TRUE(s.ToString().find("format_version") != std::string::npos) + << s.ToString(); } + + // Test format_version=0 with new magic number should be rejected { - // xxhash plain table (not currently used) - FooterBuilder footer; - ASSERT_OK(footer.Build(kPlainTableMagicNumber, /* format_version */ 1, - footer_offset, kxxHash, meta_index)); + std::array fake_footer; + std::fill(fake_footer.begin(), fake_footer.end(), 0); + // Part 1: checksum type + fake_footer[0] = kCRC32c; + // Part 3: format_version=0 and new magic number + char* part3 = fake_footer.data() + fake_footer.size() - 12; + EncodeFixed32(part3, 0); // format_version = 0 + EncodeFixed64(part3 + 4, kBlockBasedTableMagicNumber); + Footer decoded_footer; - ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); - ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber); - ASSERT_EQ(decoded_footer.checksum_type(), kxxHash); - ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); - ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); - ASSERT_EQ(decoded_footer.index_handle().offset(), 0U); - ASSERT_EQ(decoded_footer.index_handle().size(), 0U); - ASSERT_EQ(decoded_footer.format_version(), 1U); - ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U); + Status s = decoded_footer.DecodeFrom( + Slice(fake_footer.data(), fake_footer.size()), 0); + // format_version=0 is not supported for read, should return Corruption + ASSERT_TRUE(s.IsCorruption()) << s.ToString(); + ASSERT_TRUE(s.ToString().find("format_version") != std::string::npos) + << s.ToString(); + } +} + +// Test that configuring unsupported format_version for writing is sanitized +// or rejected as appropriate +TEST(TableTest, UnsupportedFormatVersionConfigTest) { + // Temporarily disable unsupported format version allowance for this test + bool& allow = TEST_AllowUnsupportedFormatVersion(); + SaveAndRestore saved_allow(&allow, false); + + // Test that format_version < kMinSupportedBbtFormatVersionForWrite is + // sanitized to kMinSupportedBbtFormatVersionForWrite during initialization + for (uint32_t fv = 0; fv < kMinSupportedBbtFormatVersionForWrite; ++fv) { + BlockBasedTableOptions table_options; + table_options.format_version = fv; + BlockBasedTableFactory factory(table_options); + + // After construction, format_version should be sanitized + auto* opts = factory.GetOptions(); + ASSERT_EQ(opts->format_version, kMinSupportedBbtFormatVersionForWrite) + << "format_version=" << fv << " should be sanitized to " + << kMinSupportedBbtFormatVersionForWrite; + } + + // Test that supported format versions are not changed + for (uint32_t fv = kMinSupportedBbtFormatVersionForWrite; + IsSupportedFormatVersionForWrite(kBlockBasedTableMagicNumber, fv); + ++fv) { + BlockBasedTableOptions table_options; + table_options.format_version = fv; + BlockBasedTableFactory factory(table_options); + + auto* opts = factory.GetOptions(); + ASSERT_EQ(opts->format_version, fv) + << "format_version=" << fv << " should not be changed"; + + ColumnFamilyOptions cf_opts; + DBOptions db_opts; + Status s = factory.ValidateOptions(db_opts, cf_opts); + ASSERT_OK(s) << "format_version=" << fv << ": " << s.ToString(); + } + + // Test that format_version > kLatestBbtFormatVersion is rejected by + // ValidateOptions (not sanitized, since it could be a future version that + // requires newer code) + { + BlockBasedTableOptions table_options; + table_options.format_version = kLatestBbtFormatVersion + 1; + BlockBasedTableFactory factory(table_options); + + ColumnFamilyOptions cf_opts; + DBOptions db_opts; + Status s = factory.ValidateOptions(db_opts, cf_opts); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); } } @@ -5181,10 +5322,6 @@ class TestPrefixExtractor : public ROCKSDB_NAMESPACE::SliceTransform { return IsValid(src); } - bool InRange(const ROCKSDB_NAMESPACE::Slice& /*dst*/) const override { - return true; - } - bool IsValid(const ROCKSDB_NAMESPACE::Slice& src) const { if (src.size() != 4) { return false; @@ -5222,7 +5359,7 @@ TEST_F(PrefixTest, PrefixAndWholeKeyTest) { const std::string kDBPath = test::PerThreadDBPath("table_prefix_test"); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); ASSERT_OK(DestroyDB(kDBPath, options)); - ROCKSDB_NAMESPACE::DB* db; + std::unique_ptr db; ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db)); // Create a bunch of keys with 10 filters. @@ -5236,7 +5373,7 @@ TEST_F(PrefixTest, PrefixAndWholeKeyTest) { // Trigger compaction. ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - delete db; + db.reset(); // In the second round, turn whole_key_filtering off and expect // rocksdb still works. } @@ -5326,7 +5463,8 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { new RandomAccessFileReader(std::move(source), "")); options.table_factory->NewTableReader( - TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(), + TableReaderOptions(ioptions, moptions.prefix_extractor, + moptions.compression_manager.get(), EnvOptions(), ikc, 0 /* block_protection_bytes_per_key */), std::move(file_reader), ss_rw.contents().size(), &table_reader); @@ -5501,7 +5639,8 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { const MutableCFOptions moptions2(options2); ASSERT_OK(moptions.table_factory->NewTableReader( - TableReaderOptions(ioptions2, moptions2.prefix_extractor, EnvOptions(), + TableReaderOptions(ioptions2, moptions2.prefix_extractor, + moptions2.compression_manager.get(), EnvOptions(), GetPlainInternalComparator(options2.comparator), 0 /* block_protection_bytes_per_key */), std::move(file_reader), sink->contents().size(), &table_reader)); @@ -5540,7 +5679,7 @@ TEST_P(BlockBasedTableTest, FixBlockAlignMismatchedFileChecksums) { const std::string kDBPath = test::PerThreadDBPath("block_align_padded_bytes_verify_file_checksums"); ASSERT_OK(DestroyDB(kDBPath, options)); - DB* db; + std::unique_ptr db; ASSERT_OK(DB::Open(options, kDBPath, &db)); ASSERT_OK(db->Put(WriteOptions(), "k1", "v1")); ASSERT_OK(db->Flush(FlushOptions())); @@ -5548,7 +5687,7 @@ TEST_P(BlockBasedTableTest, FixBlockAlignMismatchedFileChecksums) { // aligning blocks are used to generate the checksum to compare against the // one not generated by padded bytes ASSERT_OK(db->VerifyFileChecksums(ReadOptions())); - delete db; + db.reset(); } class NoBufferAlignmenttWritableFile : public FSWritableFileOwnerWrapper { @@ -5603,7 +5742,7 @@ TEST_P(BlockBasedTableTest, const std::string kDBPath = test::PerThreadDBPath( "block_align_flush_during_flush_verify_file_checksums"); ASSERT_OK(DestroyDB(kDBPath, options)); - DB* db; + std::unique_ptr db; ASSERT_OK(DB::Open(options, kDBPath, &db)); ASSERT_OK(db->Put(WriteOptions(), "k1", "k2")); @@ -5612,7 +5751,7 @@ TEST_P(BlockBasedTableTest, // Before the fix, VerifyFileChecksums() will fail as incorrect padded bytes // were used to generate checksum upon file creation ASSERT_OK(db->VerifyFileChecksums(ReadOptions())); - delete db; + db.reset(); } TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { @@ -5675,11 +5814,12 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { read_options_for_helper.verify_checksums = false; PersistentCacheOptions cache_options; - BlockFetcher block_fetcher( - file, nullptr /* prefetch_buffer */, footer, read_options_for_helper, - handle, contents, ioptions, false /* decompress */, - false /*maybe_compressed*/, block_type, - UncompressionDict::GetEmptyDict(), cache_options); + auto mgr = GetBuiltinV2CompressionManager(); + BlockFetcher block_fetcher(file, nullptr /* prefetch_buffer */, footer, + read_options_for_helper, handle, contents, + ioptions, false /* decompress */, + false /*maybe_compressed*/, block_type, + mgr->GetDecompressor().get(), cache_options); ASSERT_OK(block_fetcher.ReadBlockContents()); }; @@ -5812,12 +5952,12 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { auto metaindex_handle = footer.metaindex_handle(); BlockContents metaindex_contents; PersistentCacheOptions pcache_opts; + auto mgr = GetBuiltinV2CompressionManager(); BlockFetcher block_fetcher( table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(), metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, false /*maybe_compressed*/, BlockType::kMetaIndex, - UncompressionDict::GetEmptyDict(), pcache_opts, - nullptr /*memory_allocator*/); + mgr->GetDecompressor().get(), pcache_opts, nullptr /*memory_allocator*/); ASSERT_OK(block_fetcher.ReadBlockContents()); Block metaindex_block(std::move(metaindex_contents)); @@ -5894,12 +6034,12 @@ TEST_P(BlockBasedTableTest, SeekMetaBlocks) { auto metaindex_handle = footer.metaindex_handle(); BlockContents metaindex_contents; PersistentCacheOptions pcache_opts; + auto mgr = GetBuiltinV2CompressionManager(); BlockFetcher block_fetcher( table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(), metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, false /*maybe_compressed*/, BlockType::kMetaIndex, - UncompressionDict::GetEmptyDict(), pcache_opts, - nullptr /*memory_allocator*/); + mgr->GetDecompressor().get(), pcache_opts, nullptr /*memory_allocator*/); ASSERT_OK(block_fetcher.ReadBlockContents()); Block metaindex_block(std::move(metaindex_contents)); @@ -5944,27 +6084,25 @@ TEST_P(BlockBasedTableTest, BadOptions) { options.table_factory.reset(NewBlockBasedTableFactory(bbto)); ASSERT_OK(DestroyDB(kDBPath, options)); - std::unique_ptr db; { - ROCKSDB_NAMESPACE::DB* _db; - ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db)); + std::unique_ptr db; + ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db)); bbto.block_size = 4096; options.compression = kSnappyCompression; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db)); + ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db)); options.compression = kNoCompression; options.bottommost_compression = kSnappyCompression; - ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db)); + ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db)); options.bottommost_compression = kNoCompression; options.compression_per_level.emplace_back(kSnappyCompression); - ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db)); + ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db)); options.compression_per_level.clear(); - ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db)); - db.reset(_db); + ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db)); } } @@ -6204,6 +6342,12 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnNext) { class ChargeCompressionDictionaryBuildingBufferTest : public BlockBasedTableTestBase {}; TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) { + if (GetSupportedDictCompressions().empty()) { + ROCKSDB_GTEST_SKIP("No supported dict compression"); + return; + } + const auto kCompression = GetSupportedDictCompressions()[0]; + constexpr std::size_t kSizeDummyEntry = 256 * 1024; constexpr std::size_t kMetaDataChargeOverhead = 10000; constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024; @@ -6227,7 +6371,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) { {CacheEntryRole::kCompressionDictionaryBuildingBuffer, {/*.charged = */ charge_compression_dictionary_building_buffer}}); Options options; - options.compression = kSnappyCompression; + options.compression = kCompression; options.compression_opts.max_dict_bytes = kMaxDictBytes; options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -6248,7 +6392,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) { options.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, &internal_tbl_prop_coll_factories, - kSnappyCompression, options.compression_opts, + kCompression, options.compression_opts, kUnknownColumnFamily, "test_cf", -1 /* level */, kUnknownNewestKeyTime), file_writer.get())); @@ -6287,6 +6431,12 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) { TEST_F(ChargeCompressionDictionaryBuildingBufferTest, BasicWithBufferLimitExceed) { + if (GetSupportedDictCompressions().empty()) { + ROCKSDB_GTEST_SKIP("No supported dict compression"); + return; + } + const auto kCompression = GetSupportedDictCompressions()[0]; + constexpr std::size_t kSizeDummyEntry = 256 * 1024; constexpr std::size_t kMetaDataChargeOverhead = 10000; constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024; @@ -6306,7 +6456,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, std::make_shared(); Options options; - options.compression = kSnappyCompression; + options.compression = kCompression; options.compression_opts.max_dict_bytes = kMaxDictBytes; options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -6325,7 +6475,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, const WriteOptions write_options; std::unique_ptr builder(options.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, - &internal_tbl_prop_coll_factories, kSnappyCompression, + &internal_tbl_prop_coll_factories, kCompression, options.compression_opts, kUnknownColumnFamily, "test_cf", -1 /* level */, kUnknownNewestKeyTime), file_writer.get())); @@ -6368,6 +6518,12 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, } TEST_F(ChargeCompressionDictionaryBuildingBufferTest, BasicWithCacheFull) { + if (GetSupportedDictCompressions().empty()) { + ROCKSDB_GTEST_SKIP("No supported dict compression"); + return; + } + const auto kCompression = GetSupportedDictCompressions()[0]; + constexpr std::size_t kSizeDummyEntry = 256 * 1024; constexpr std::size_t kMetaDataChargeOverhead = 10000; // A small kCacheCapacity is chosen so that increase cache charging for @@ -6393,7 +6549,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, BasicWithCacheFull) { std::make_shared(); Options options; - options.compression = kSnappyCompression; + options.compression = kCompression; options.compression_opts.max_dict_bytes = kMaxDictBytes; options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -6412,7 +6568,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, BasicWithCacheFull) { const WriteOptions write_options; std::unique_ptr builder(options.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc, - &internal_tbl_prop_coll_factories, kSnappyCompression, + &internal_tbl_prop_coll_factories, kCompression, options.compression_opts, kUnknownColumnFamily, "test_cf", -1 /* level */, kUnknownNewestKeyTime), file_writer.get())); @@ -6525,35 +6681,197 @@ TEST_F(CacheUsageOptionsOverridesTest, SanitizeAndValidateOptions) { Destroy(options); } -class ExternalTableReaderTest : public DBTestBase { +class ExternalTableTest : public DBTestBase { public: - ExternalTableReaderTest() - : DBTestBase("external_table_reader_test", /*env_do_fsync=*/false) {} + ExternalTableTest() + : DBTestBase("external_table_test", /*env_do_fsync=*/false) {} protected: - class DummyExternalTableIterator : public Iterator { + class DummyExternalTableFile { + public: + explicit DummyExternalTableFile(const std::string& file_path, + FSWritableFile* file) + : file_path_(file_path), file_(file), file_size_(0) { + props_.comparator_name = BytewiseComparator()->Name(); + } + + Status Serialize( + const std::vector>& kv_vec) { + // First append the property block if one exists + uint32_t prop_block_size = static_cast(prop_block_.length()); + buf_.append(static_cast(static_cast(&prop_block_size)), + sizeof(prop_block_size)); + if (!prop_block_.empty()) { + buf_.append(prop_block_); + } + for (auto& kv : kv_vec) { + SerializeOne(kv.first, kv.second); + props_.raw_key_size += kv.first.length(); + props_.raw_value_size += kv.second.length(); + } + props_.num_entries = kv_vec.size(); + file_size_ = buf_.length(); + if (file_) { + return file_->Append(buf_, IOOptions(), /*dbg=*/nullptr); + } else { + return WriteStringToFile(Env::Default(), buf_, file_path_); + } + } + + Status Deserialize(std::map& kv_map) { + Status s = ReadFileToString(Env::Default(), file_path_, &buf_); + if (!s.ok()) { + return s; + } + + uint32_t prop_block_size = 0; + buf_.copy(static_cast(static_cast(&prop_block_size)), + sizeof(prop_block_size)); + buf_.erase(0, sizeof(prop_block_size)); + prop_block_.assign(buf_.substr(0, prop_block_size)); + buf_.erase(0, prop_block_size); + while (buf_.length() > 0) { + std::pair kv; + s = DeserializeOne(kv); + if (!s.ok()) { + break; + } + size_t key_size = kv.first.length(); + size_t value_size = kv.second.length(); + kv_map.emplace(std::move(kv)); + props_.raw_key_size += key_size; + props_.raw_value_size += value_size; + } + props_.num_entries = kv_map.size(); + return s; + } + + Status PutPropertiesBlock(const Slice& prop_block) { + prop_block_.assign(prop_block.data(), prop_block.size()); + return Status::OK(); + } + + Status GetPropertiesBlock(std::unique_ptr* block, uint64_t* size, + uint64_t* file_offset) { + if (!prop_block_.empty()) { + *block = std::make_unique(prop_block_.length()); + memcpy(block->get(), prop_block_.data(), prop_block_.length()); + *size = prop_block_.length(); + *file_offset = sizeof(uint32_t); + } else { + *size = 0; + } + return Status::OK(); + } + + TableProperties GetTableProperties() const { return props_; } + + uint64_t FileSize() const { return file_size_; } + + private: + struct ItemHeader { + uint32_t key_size; + uint32_t value_size; + }; + + void SerializeOne(const Slice& key, const Slice& value) { + ItemHeader hdr; + hdr.key_size = static_cast(key.size()); + hdr.value_size = static_cast(value.size()); + buf_.append(static_cast(static_cast(&hdr)), sizeof(hdr)); + buf_.append(key.data(), key.size()); + buf_.append(value.data(), value.size()); + } + + Status DeserializeOne(std::pair& kv) { + ItemHeader hdr; + size_t copied = + buf_.copy(static_cast(static_cast(&hdr)), sizeof(hdr)); + if (copied < sizeof(hdr)) { + return Status::Corruption(); + } + buf_.erase(0, sizeof(hdr)); + if (buf_.length() < hdr.key_size + hdr.value_size) { + return Status::Corruption(); + } + kv.first.assign(std::string_view(buf_.data(), hdr.key_size)); + buf_.erase(0, hdr.key_size); + kv.second.assign(std::string_view(buf_.data(), hdr.value_size)); + buf_.erase(0, hdr.value_size); + return Status::OK(); + } + + std::string file_path_; + FSWritableFile* file_; + std::string buf_; + TableProperties props_; + uint64_t file_size_; + std::string prop_block_; + }; + + class DummyExternalTableIterator : public ExternalTableIterator { public: - explicit DummyExternalTableIterator(bool empty) : empty_(empty) {} + explicit DummyExternalTableIterator( + const ReadOptions& /*ro*/, + const std::map& kv_map) + : scan_options_(nullptr), + num_opts_(0), + scan_idx_(0), + kv_map_(kv_map), + valid_(false) { + TEST_SYNC_POINT_CALLBACK("DummyExternalTableIterator::Constructor", + &status_); + } - bool Valid() const override { return empty_ ? !empty_ : valid_; } + bool Valid() const override { return valid_; } void SeekToFirst() override { - valid_ = true; - status_ = Status::OK(); + if (scan_options_) { + status_ = Status::InvalidArgument(); + } else { + iter_ = kv_map_.begin(); + valid_ = iter_ != kv_map_.end(); + status_ = Status::OK(); + } } void SeekToLast() override { - valid_ = true; - status_ = Status::OK(); + if (scan_options_) { + status_ = Status::InvalidArgument(); + } else { + if (!kv_map_.empty()) { + iter_ = kv_map_.begin(); + for (uint64_t i = 0; i < kv_map_.size() - 1; ++i) { + iter_++; + } + valid_ = true; + } else { + valid_ = false; + } + status_ = Status::OK(); + } } void Seek(const Slice& target) override { - if (target.compare(key_str) <= 0) { - valid_ = true; - } else { - valid_ = false; + if (status_.ok()) { + iter_ = kv_map_.find(target.ToString()); + valid_ = iter_ != kv_map_.end(); + eof_ = iter_ == kv_map_.end(); + } + if (scan_options_) { + if (scan_idx_ >= num_opts_ || + target != scan_options_[scan_idx_].range.start.value().ToString()) { + status_ = Status::InvalidArgument(); + } else { + if (valid_ && scan_options_[scan_idx_].range.limit.has_value() && + iter_->first.compare( + scan_options_[scan_idx_].range.limit.value().ToString()) >= + 0) { + valid_ = false; + } + scan_idx_++; + } } - status_ = Status::OK(); } void SeekForPrev(const Slice& /*target*/) override { @@ -6562,8 +6880,38 @@ class ExternalTableReaderTest : public DBTestBase { } void Next() override { - valid_ = false; - // status_ is still ok. valid_ indicates end of scan + iter_++; + valid_ = iter_ != kv_map_.end(); + eof_ = iter_ == kv_map_.end(); + if (valid_ && scan_options_ && + scan_options_[scan_idx_ - 1].range.limit.has_value() && + iter_->first.compare( + scan_options_[scan_idx_ - 1].range.limit.value().ToString()) >= + 0) { + valid_ = false; + } + // status_ is still ok. !valid_ indicates end of scan + } + + bool NextAndGetResult(IterateResult* result) override { + Next(); + if (valid_) { + result->key = key(); + result->bound_check_result = IterBoundCheck::kInbound; + result->value_prepared = true; + } else { + result->key = Slice(); + result->bound_check_result = + eof_ ? IterBoundCheck::kUnknown : IterBoundCheck::kOutOfBound; + result->value_prepared = false; + } + return valid_; + } + + bool PrepareValue() override { return valid_ ? true : false; } + + IterBoundCheck UpperBoundCheckResult() override { + return eof_ ? IterBoundCheck::kUnknown : IterBoundCheck::kOutOfBound; } void Prev() override { @@ -6573,7 +6921,7 @@ class ExternalTableReaderTest : public DBTestBase { Slice key() const override { // If valid_ is false or status_ is non-ok, behavior is indeterminate - return Slice(key_str); + return Slice(iter_->first); } Status status() const override { @@ -6583,31 +6931,47 @@ class ExternalTableReaderTest : public DBTestBase { Slice value() const override { // If valid_ is false or status_ is non-ok, behavior is indeterminate - return Slice(value_str); + return Slice(iter_->second); } - private: - static const std::string key_str; - static const std::string value_str; + void Prepare(const ScanOptions scan_opts[], size_t num_opts) override { + scan_options_ = scan_opts; + num_opts_ = num_opts; + } + private: + const ScanOptions* scan_options_; + size_t num_opts_; + size_t scan_idx_; + std::map kv_map_; bool valid_ = false; - bool empty_; + bool eof_ = false; Status status_ = Status::OK(); + std::map::iterator iter_; }; class DummyExternalTableReader : public ExternalTableReader { public: - Iterator* NewIterator(const ReadOptions& read_options, - const SliceTransform* /*prefix_extractor*/) override { - return new DummyExternalTableIterator((read_options.weight == 0) ? true - : false); + explicit DummyExternalTableReader(const std::string& file_path, + bool support_property_block) + : file_(file_path, /*file=*/nullptr), + support_property_block_(support_property_block) { + Status s = file_.Deserialize(kv_map_); + EXPECT_OK(s); + } + + ExternalTableIterator* NewIterator( + const ReadOptions& read_options, + const SliceTransform* /*prefix_extractor*/) override { + return new DummyExternalTableIterator(read_options, kv_map_); } Status Get(const ReadOptions& /*read_options*/, const Slice& key, const SliceTransform* /*prefix_extractor*/, std::string* value) override { - if (!key.compare("foo")) { - value->assign("bar"); + auto iter = kv_map_.find(key.ToString()); + if (iter != kv_map_.end()) { + value->assign(iter->second); return Status::OK(); } return Status::NotFound(); @@ -6626,6 +6990,14 @@ class ExternalTableReaderTest : public DBTestBase { } } + Status GetPropertiesBlock(std::unique_ptr* block, uint64_t* size, + uint64_t* file_offset) override { + if (!support_property_block_) { + return Status::NotSupported(); + } + return file_.GetPropertiesBlock(block, size, file_offset); + } + std::shared_ptr GetTableProperties() const override { std::shared_ptr props = std::make_shared(); @@ -6635,39 +7007,115 @@ class ExternalTableReaderTest : public DBTestBase { props->raw_value_size = 3; return props; } + + private: + std::map kv_map_; + DummyExternalTableFile file_; + bool support_property_block_; + }; + + class DummyExternalTableBuilder : public ExternalTableBuilder { + public: + explicit DummyExternalTableBuilder(const std::string& file_path, + FSWritableFile* file, + bool support_property_block) + : file_(file_path, file), + support_property_block_(support_property_block) {} + + void Add(const Slice& key, const Slice& value) override { + if (!kv_vec_.empty()) { + ASSERT_LT(BytewiseComparator()->Compare(kv_vec_.back().first, key), 0); + } + kv_vec_.emplace_back(key.ToString(), value.ToString()); + } + + Status Finish() override { + status_ = file_.Serialize(kv_vec_); + return status_; + } + + void Abandon() override { kv_vec_.clear(); } + + uint64_t FileSize() const override { return file_.FileSize(); } + + Status PutPropertiesBlock(const Slice& block) override { + if (!support_property_block_) { + return Status::NotSupported(); + } + return file_.PutPropertiesBlock(block); + } + + TableProperties GetTableProperties() const override { + return file_.GetTableProperties(); + } + + Status status() const override { return status_; } + + private: + std::vector> kv_vec_; + DummyExternalTableFile file_; + Status status_; + bool support_property_block_; }; class DummyExternalTableFactory : public ExternalTableFactory { public: + explicit DummyExternalTableFactory(bool support_property_block) + : support_property_block_(support_property_block) {} const char* Name() const override { return "DummyExternalTableFactory"; } Status NewTableReader( - const ReadOptions& /*read_options*/, const std::string& /*file_path*/, - const ExternalTableOptions& /*topts*/, - std::unique_ptr* table_reader) override { - table_reader->reset(new DummyExternalTableReader()); + const ReadOptions& /*read_options*/, const std::string& file_path, + const ExternalTableOptions& topts, + std::unique_ptr* table_reader) const override { + // Sanity check some options + EXPECT_EQ(topts.file_options.handoff_checksum_type, + ChecksumType::kCRC32c); + table_reader->reset( + new DummyExternalTableReader(file_path, support_property_block_)); return Status::OK(); } + + ExternalTableBuilder* NewTableBuilder( + const ExternalTableBuilderOptions& /*opts*/, + const std::string& file_path, FSWritableFile* file) const override { + return new DummyExternalTableBuilder(file_path, file, + support_property_block_); + } + + private: + bool support_property_block_; }; }; -const std::string ExternalTableReaderTest::DummyExternalTableIterator::key_str = - "foo"; -const std::string - ExternalTableReaderTest::DummyExternalTableIterator::value_str = "bar"; - -TEST_F(ExternalTableReaderTest, BasicTest) { +TEST_F(ExternalTableTest, BasicTest) { std::shared_ptr factory = - std::make_shared(); + std::make_shared( + /*support_property_block=*/false); + + std::string file_path = test::PerThreadDBPath("external_table"); + { + std::unique_ptr builder; + builder.reset(factory->NewTableBuilder( + ExternalTableBuilderOptions(ReadOptions(), WriteOptions(), + std::shared_ptr(), + BytewiseComparator(), "default", + TableFileCreationReason::kMisc), + file_path, /*file=*/nullptr)); + builder->Add("foo", "bar"); + ASSERT_OK(builder->Finish()); + } std::unique_ptr reader; std::shared_ptr prefix_extractor; ASSERT_OK(factory->NewTableReader( - {}, "", ExternalTableOptions(prefix_extractor, nullptr), &reader)); + {}, file_path, + ExternalTableOptions(prefix_extractor, /*comparator=*/nullptr, + /*fs=*/nullptr, FileOptions()), + &reader)); ReadOptions ro; - ro.weight = 1; - std::unique_ptr iter(reader->NewIterator(ro, nullptr)); + std::unique_ptr iter(reader->NewIterator(ro, nullptr)); ASSERT_NE(iter, nullptr); iter->Seek("foo"); ASSERT_TRUE(iter->Valid() && iter->status().ok()); @@ -6689,25 +7137,32 @@ TEST_F(ExternalTableReaderTest, BasicTest) { ASSERT_EQ(statuses[1], Status::NotFound()); } -TEST_F(ExternalTableReaderTest, SstReaderTest) { +TEST_F(ExternalTableTest, SstReaderTest) { + if (encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-encrypted environment"); + return; + } Options options = GetDefaultOptions(); - std::string dbname = test::PerThreadDBPath("external_table_reader_test"); + std::string dbname = test::PerThreadDBPath("external_table_test"); std::string ingest_file = dbname + "test.immutabledb"; dbname += "_db"; std::shared_ptr factory = - std::make_shared(); + std::make_shared( + /*support_property_block=*/false); options.table_factory = NewExternalTableFactory(factory); - // Create a file - ASSERT_OK(WriteStringToFile(options.env, "Hello World", ingest_file, - /*should_sync=*/true)); + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options)); + ASSERT_OK(writer->Open(ingest_file)); + ASSERT_OK(writer->Put("foo", "bar")); + ASSERT_OK(writer->Finish()); + writer.reset(); std::unique_ptr reader(new SstFileReader(options)); ASSERT_OK(reader->Open(ingest_file)); ReadOptions ro; - ro.weight = 1; std::unique_ptr iter(reader->NewIterator(ro)); ASSERT_NE(iter, nullptr); iter->Seek("foo"); @@ -6718,9 +7173,2545 @@ TEST_F(ExternalTableReaderTest, SstReaderTest) { ASSERT_TRUE(iter->status().ok()); } -} // namespace ROCKSDB_NAMESPACE +TEST_F(ExternalTableTest, ExternalFileChecksumTest) { + if (encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-encrypted environment"); + return; + } + Options options = GetDefaultOptions(); + std::string dbname = test::PerThreadDBPath("external_table_test"); + std::string ingest_file = dbname + "test.immutable"; + dbname += "_db"; + ASSERT_OK(DestroyDB(dbname, options)); + + std::shared_ptr factory = + std::make_shared( + /*support_property_block=*/true); + options.table_factory = NewExternalTableFactory(factory); + + // Create a file + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options)); + ASSERT_OK(writer->Open(ingest_file)); + ASSERT_OK(writer->Put("foo", "bar")); + ASSERT_OK(writer->Put("foo2", "bar2")); + ExternalSstFileInfo info; + ASSERT_OK(writer->Finish(&info)); + writer.reset(); + + FileChecksumGenContext cksum_ctx; + FileChecksumGenCrc32c cksum_gen(cksum_ctx); + std::string file_data; + ASSERT_OK(ReadFileToString(options.env, ingest_file, &file_data)); + cksum_gen.Update(file_data.data(), file_data.size()); + cksum_gen.Finalize(); + ASSERT_EQ(info.file_checksum, cksum_gen.GetChecksum()); +} + +TEST_F(ExternalTableTest, DBIterTest) { + if (encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-encrypted environment"); + return; + } + Options options = GetDefaultOptions(); + std::string dbname = test::PerThreadDBPath("external_table_test"); + std::string ingest_file = dbname + "test.immutable"; + dbname += "_db"; + ASSERT_OK(DestroyDB(dbname, options)); + + std::shared_ptr factory = + std::make_shared( + /*support_property_block=*/true); + options.table_factory = NewExternalTableFactory(factory); + + // Create a file + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options)); + ASSERT_OK(writer->Open(ingest_file)); + ASSERT_OK(writer->Put("foo", "bar")); + ASSERT_OK(writer->Put("foo2", "bar2")); + ASSERT_OK(writer->Finish()); + writer.reset(); + + std::unique_ptr db; + options.create_if_missing = true; + Status s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh)); + + IngestExternalFileOptions ifo; + ifo.allow_db_generated_files = true; + ifo.fill_cache = false; + s = db->IngestExternalFile(cfh, {ingest_file}, ifo); + ASSERT_OK(s); + + std::unique_ptr iter(db->NewIterator({}, cfh)); + ASSERT_NE(iter, nullptr); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid() && iter->status().ok()); + ASSERT_EQ(iter->value(), "bar"); + iter->Next(); + ASSERT_TRUE(iter->Valid() && iter->status().ok()); + ASSERT_EQ(iter->key(), "foo2"); + ASSERT_EQ(iter->value(), "bar2"); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + iter.reset(); + + ASSERT_OK(db->DestroyColumnFamilyHandle(cfh)); + ASSERT_OK(db->Close()); +} + +TEST_F(ExternalTableTest, DBMultiScanTest) { + if (encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-encrypted environment"); + return; + } + Options options = GetDefaultOptions(); + std::string dbname = test::PerThreadDBPath("external_table_test"); + std::string ingest_file = dbname + "test.immutable"; + dbname += "_db"; + ASSERT_OK(DestroyDB(dbname, options)); + + std::shared_ptr factory = + std::make_shared( + /*support_property_block=*/true); + options.table_factory = NewExternalTableFactory(factory); + + // Create a file + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options)); + ASSERT_OK(writer->Open(ingest_file)); + for (int i = 0; i < 100; ++i) { + std::stringstream ss; + ss << std::setw(2) << std::setfill('0') << i; + ASSERT_OK(writer->Put("k" + ss.str(), "val" + ss.str())); + } + ASSERT_OK(writer->Finish()); + writer.reset(); + + std::unique_ptr db; + options.create_if_missing = true; + Status s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh)); + + IngestExternalFileOptions ifo; + ifo.allow_db_generated_files = true; + ifo.fill_cache = false; + s = db->IngestExternalFile(cfh, {ingest_file}, ifo); + ASSERT_OK(s); + + std::vector key_ranges({"k03", "k10", "k25", "k50"}); + ReadOptions ro; + MultiScanArgs scan_options(BytewiseComparator()); + scan_options.insert(key_ranges[0], key_ranges[1]); + scan_options.insert(key_ranges[2], key_ranges[3]); + std::unique_ptr iter = db->NewMultiScan(ro, cfh, scan_options); + try { + int idx = 0; + int count = 0; + for (auto range : *iter) { + for (auto it : range) { + ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0); + ASSERT_LT(it.first.ToString().compare(key_ranges[idx + 1]), 0); + count++; + } + idx += 2; + } + ASSERT_EQ(count, 32); + } catch (MultiScanException& ex) { + // Make sure exception contains the status + ASSERT_NOK(ex.status()); + std::cerr << "Iterator returned status " << ex.what(); + abort(); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + iter.reset(); + + // Test the overlapping scan case + key_ranges[1] = "k30"; + scan_options = MultiScanArgs(BytewiseComparator()); + scan_options.insert(key_ranges[0], key_ranges[1]); + scan_options.insert(key_ranges[2], key_ranges[3]); + + iter = db->NewMultiScan(ro, cfh, scan_options); + try { + int idx = 0; + int count = 0; + for (auto range : *iter) { + for (auto it : range) { + ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0); + ASSERT_LT(it.first.ToString().compare(key_ranges[idx + 1]), 0); + count++; + } + idx += 2; + } + ASSERT_EQ(count, 52); + } catch (MultiScanException& ex) { + // Make sure exception contains the status + ASSERT_NOK(ex.status()); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + iter.reset(); + + // Test the no limit scan case + scan_options = MultiScanArgs(BytewiseComparator()); + scan_options.insert(key_ranges[0]); + scan_options.insert(key_ranges[2]); + iter = db->NewMultiScan(ro, cfh, scan_options); + try { + int idx = 0; + int count = 0; + for (auto range : *iter) { + for (auto it : range) { + ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0); + if (it.first.ToString().compare(key_ranges[idx + 1]) == 0) { + break; + } + count++; + } + idx += 2; + } + ASSERT_EQ(count, 52); + } catch (MultiScanException& ex) { + // Make sure exception contains the status + ASSERT_NOK(ex.status()); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + iter.reset(); + + SyncPoint::GetInstance()->SetCallBack( + "DummyExternalTableIterator::Constructor", [](void* arg) { + Status* status = static_cast(arg); + *status = Status::IOError(); + }); + SyncPoint::GetInstance()->EnableProcessing(); + iter = db->NewMultiScan(ro, cfh, scan_options); + try { + for (auto range : *iter) { + // Should not get here. Iterator should throw an exception + assert(false); + for (auto it : range) { + (void)it; + assert(false); + } + } + } catch (MultiScanException& ex) { + // Make sure exception contains the status + ASSERT_NOK(ex.status()); + } catch (std::logic_error& ex) { + std::cerr << "Iterator returned logic error " << ex.what(); + abort(); + } + iter.reset(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_OK(db->DestroyColumnFamilyHandle(cfh)); + ASSERT_OK(db->Close()); +} + +TEST_F(ExternalTableTest, IngestionTest) { + if (encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-encrypted environment"); + return; + } + Options options = GetDefaultOptions(); + std::string dbname = test::PerThreadDBPath("external_table_test"); + std::string ingest_file = dbname + "test.immutable"; + dbname += "_db"; + ASSERT_OK(DestroyDB(dbname, options)); + + std::shared_ptr factory = + std::make_shared( + /*support_property_block=*/true); + options.table_factory = NewExternalTableFactory(factory); + + // Create a file + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options)); + ASSERT_OK(writer->Open(ingest_file)); + ASSERT_OK(writer->Put("foo", "bar")); + ASSERT_OK(writer->Put("foo2", "bar2")); + ASSERT_OK(writer->Finish()); + writer.reset(); + + std::unique_ptr db; + options.create_if_missing = true; + Status s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh)); + + IngestExternalFileOptions ifo; + ifo.allow_db_generated_files = false; + ifo.fill_cache = false; + s = db->IngestExternalFile(cfh, {ingest_file}, ifo); + ASSERT_OK(s); + + std::unique_ptr iter(db->NewIterator({}, cfh)); + ASSERT_NE(iter, nullptr); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid() && iter->status().ok()); + ASSERT_EQ(iter->value(), "bar"); + iter->Next(); + ASSERT_TRUE(iter->Valid() && iter->status().ok()); + ASSERT_EQ(iter->key(), "foo2"); + ASSERT_EQ(iter->value(), "bar2"); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + iter.reset(); + + // Create an overlapping file to ingest with atomic_replace_range option + ingest_file += "2"; + writer.reset(new SstFileWriter(EnvOptions(), options)); + ASSERT_OK(writer->Open(ingest_file)); + ASSERT_OK(writer->Put("foo", "val")); + ASSERT_OK(writer->Put("foo2", "val2")); + ASSERT_OK(writer->Finish()); + writer.reset(); + + ifo.snapshot_consistency = false; + s = db->IngestExternalFiles({{cfh, + {ingest_file}, + ifo, + {}, + {}, + Temperature::kUnknown, + {{nullptr, nullptr}}}}); + ASSERT_OK(s); + + iter.reset(db->NewIterator({}, cfh)); + ASSERT_NE(iter, nullptr); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid() && iter->status().ok()); + ASSERT_EQ(iter->value(), "val"); + iter->Next(); + ASSERT_TRUE(iter->Valid() && iter->status().ok()); + ASSERT_EQ(iter->key(), "foo2"); + ASSERT_EQ(iter->value(), "val2"); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + iter.reset(); + + // Create an overlapping file to ingest without atomic_replace_range option. + // This should fail as we don't support ingesting an external file with + // non-zero assigned sequence number. + ingest_file += "3"; + writer.reset(new SstFileWriter(EnvOptions(), options)); + ASSERT_OK(writer->Open(ingest_file)); + ASSERT_OK(writer->Put("foo", "newval")); + ASSERT_OK(writer->Put("foo2", "newval2")); + ASSERT_OK(writer->Finish()); + writer.reset(); + + s = db->IngestExternalFiles( + {{cfh, {ingest_file}, ifo, {}, {}, Temperature::kUnknown, {}}}); + ASSERT_EQ(s, Status::NotSupported()); + + ASSERT_OK(db->DestroyColumnFamilyHandle(cfh)); + ASSERT_OK(db->Close()); +} + +class UserDefinedIndexTestBase : public BlockBasedTableTestBase { + public: + class CustomFlushBlockPolicy : public FlushBlockPolicy { + public: + explicit CustomFlushBlockPolicy(int keys_per_block) + : keys_in_current_block_(0), keys_per_block_(keys_per_block) {} + + bool Update(const Slice& /*key*/, const Slice& /*value*/) override { + if (keys_in_current_block_ >= keys_per_block_) { + keys_in_current_block_ = 1; + return true; + } + keys_in_current_block_++; + return false; + } + + private: + int keys_in_current_block_; + int keys_per_block_; + }; + + class CustomFlushBlockPolicyFactory : public FlushBlockPolicyFactory { + public: + CustomFlushBlockPolicyFactory(int keys_per_block = 3) + : keys_per_block_(keys_per_block) {} + const char* Name() const override { return "CustomFlushBlockPolicy"; } + FlushBlockPolicy* NewFlushBlockPolicy(const BlockBasedTableOptions&, + const BlockBuilder&) const override { + return new CustomFlushBlockPolicy(keys_per_block_); + } + int keys_per_block_; + }; + + public: + class TestUserDefinedIndexFactory : public UserDefinedIndexFactory { + public: + const char* Name() const override { return "test_index"; } + Status NewBuilder( + const UserDefinedIndexOption& /*option*/, + std::unique_ptr& builder) const override { + builder = std::make_unique(); + return Status::OK(); + } + + struct CustomizedMapComparator { + CustomizedMapComparator(const Comparator* _comparator) + : comparator(_comparator) {} + const Comparator* comparator; + bool operator()(const std::string& lhs, const std::string& rhs) const { + return comparator->Compare(lhs, rhs) < 0; + } + }; + + // Deprecated API + UserDefinedIndexBuilder* NewBuilder() const override { return nullptr; } + + std::unique_ptr NewReader( + Slice& /*index_block*/) const override { + return nullptr; + } + + Status NewReader( + const UserDefinedIndexOption& option, Slice& index_block, + std::unique_ptr& reader) const override { + reader = std::make_unique( + index_block, option.comparator, this); + return Status::OK(); + } + + uint64_t seek_error_count_ = 0; + uint64_t next_error_count_ = 0; + + private: + class TestUserDefinedIndexBuilder : public UserDefinedIndexBuilder { + public: + TestUserDefinedIndexBuilder() : entries_added_(0), keys_added_(0) {} + + Slice AddIndexEntry(const Slice& last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle, + std::string* separator_scratch) override { + if (keys_added_ == 0) { + return last_key_in_current_block; + } + EXPECT_EQ(last_key_in_current_block.size(), 5); + if (first_key_in_next_block) { + EXPECT_EQ(first_key_in_next_block->size(), 5); + } + // Unused parameters + (void)separator_scratch; + entries_added_++; + index_data_[last_key_in_current_block.ToString()].clear(); + // Store the block handle for each key + PutFixed64(&index_data_[last_key_in_current_block.ToString()], + block_handle.offset); + PutFixed64(&index_data_[last_key_in_current_block.ToString()], + block_handle.size); + PutFixed32(&index_data_[last_key_in_current_block.ToString()], + keys_added_); + keys_added_ = 0; + return last_key_in_current_block; + } + + void OnKeyAdded(const Slice& key, ValueType /*value*/, + const Slice& /*value*/) override { + if (key.starts_with("dummy")) { + return; + } + EXPECT_EQ(key.size(), 5); + // Track keys added to the index + keys_added_++; + // Add dummy entry + PutFixed64(&index_data_[key.ToString()], 0); + PutFixed64(&index_data_[key.ToString()], 0); + PutFixed32(&index_data_[key.ToString()], 0); + } + + Status Finish(Slice* index_contents) override { + if (entries_added_ == 0) { + *index_contents = Slice(); + return Status::OK(); + } + // Serialize the index data + std::string result; + for (const auto& entry : index_data_) { + PutLengthPrefixedSlice(&result, entry.first); + result.append(entry.second); + } + index_contents_data_ = result; + *index_contents = index_contents_data_; + return Status::OK(); + } + + int GetEntriesAdded() const { return entries_added_; } + + private: + int entries_added_; + std::map index_data_; + uint32_t keys_added_; + std::string index_contents_data_; + }; + + class TestUserDefinedIndexReader : public UserDefinedIndexReader { + public: + explicit TestUserDefinedIndexReader( + Slice& index_block, const Comparator* comparator, + const TestUserDefinedIndexFactory* factory) + : factory_(factory), + comparator_(comparator), + index_data_(CustomizedMapComparator(comparator)) { + Slice block = index_block; + while (!block.empty()) { + Slice key; + uint64_t offset = 0; + uint64_t size = 0; + uint32_t num_keys = 0; + EXPECT_TRUE(GetLengthPrefixedSlice(&block, &key)); + EXPECT_TRUE(GetFixed64(&block, &offset)); + EXPECT_TRUE(GetFixed64(&block, &size)); + EXPECT_TRUE(GetFixed32(&block, &num_keys)); + + UserDefinedIndexBuilder::BlockHandle handle{0, 0}; + handle.offset = offset; + handle.size = size; + index_data_[key.ToString()] = + std::make_pair( + std::move(handle), std::move(num_keys)); + } + } + + std::unique_ptr NewIterator( + const ReadOptions& /*ro*/) override { + return std::make_unique( + index_data_, factory_, comparator_); + } + + size_t ApproximateMemoryUsage() const override { return 0; } + + private: + class TestUserDefinedIndexIterator : public UserDefinedIndexIterator { + public: + TestUserDefinedIndexIterator( + std::map, + CustomizedMapComparator>& index, + const TestUserDefinedIndexFactory* factory, + const Comparator* comparator) + : index_(index), + iter_(index_.end()), + scan_opts_(nullptr), + num_opts_(0), + target_num_keys_(0), + seek_error_count_(factory->seek_error_count_), + next_error_count_(factory->next_error_count_), + comparator_(comparator) {} + + Status SeekAndGetResult(const Slice& key, + IterateResult* result) override { + Status s; + if (seek_error_count_) { + seek_error_count_--; + s = Status::IOError(); + } + if (!s.ok()) { + return s; + } + if (scan_opts_) { + // Seeks should be in order specified in scan_opts_ + EXPECT_EQ(comparator_->Compare( + scan_opts_[scan_idx_].range.start.value(), key), + 0); + EXPECT_TRUE(scan_opts_[scan_idx_].property_bag.has_value()); + target_num_keys_ = std::stoi(scan_opts_[scan_idx_] + .property_bag.value() + .find("count") + ->second); + scan_idx_++; + } + iter_ = index_.lower_bound(key.ToString()); + if ((iter_ != index_.end()) && IsInbound()) { + AdvanceToNextIndexEntry(); + result->bound_check_result = IterBoundCheck::kInbound; + result->key = Slice(iter_->first); + if (scan_opts_ && target_num_keys_ > 0 && + comparator_->Compare(key, iter_->first) == 0) { + target_num_keys_--; + } + } else { + result->bound_check_result = IterBoundCheck::kOutOfBound; + result->key = Slice(); + } + return Status::OK(); + } + + Status NextAndGetResult(IterateResult* result) override { + Status s; + if (next_error_count_) { + next_error_count_--; + s = Status::IOError(); + } + if (!s.ok()) { + return s; + } + if (scan_opts_ && scan_opts_[scan_idx_ - 1].range.limit.has_value()) { + if (comparator_->Compare( + iter_->first, + scan_opts_[scan_idx_ - 1].range.limit.value()) >= 0) { + result->bound_check_result = IterBoundCheck::kOutOfBound; + result->key = Slice(); + return Status::OK(); + } + } + if (scan_opts_ && target_num_keys_ == 0) { + result->key = Slice(); + result->bound_check_result = IterBoundCheck::kOutOfBound; + return Status::OK(); + } + iter_++; + if ((iter_ != index_.end()) && IsInbound()) { + AdvanceToNextIndexEntry(); + result->bound_check_result = IterBoundCheck::kInbound; + result->key = Slice(iter_->first); + target_num_keys_ -= + std::min(target_num_keys_, iter_->second.second); + } else { + // EOF + result->bound_check_result = IterBoundCheck::kUnknown; + result->key = Slice(); + } + return Status::OK(); + } + + void AdvanceToNextIndexEntry() { + while (iter_->second.second == 0) { + iter_++; + } + } + + bool IsInbound() { + if (!scan_opts_) { + return true; + } + if (scan_opts_[scan_idx_ - 1].range.limit.has_value() && + comparator_->Compare( + scan_opts_[scan_idx_ - 1].range.limit.value(), + iter_->first) <= 0) { + return false; + } + return true; + } + + UserDefinedIndexBuilder::BlockHandle value() override { + UserDefinedIndexBuilder::BlockHandle handle{0, 0}; + handle.offset = iter_->second.first.offset; + handle.size = iter_->second.first.size; + return handle; + } + + void Prepare(const ScanOptions scan_opts[], size_t num_opts) override { + // Prepare should only be called once + EXPECT_EQ(scan_opts_, nullptr); + scan_opts_ = scan_opts; + num_opts_ = num_opts; + scan_idx_ = 0; + } + + private: + std::map, + CustomizedMapComparator>& index_; + std::map>::iterator iter_; + const ScanOptions* scan_opts_; + size_t num_opts_{}; + size_t scan_idx_{}; + uint32_t target_num_keys_; + uint64_t seek_error_count_; + uint64_t next_error_count_; + const Comparator* comparator_; + }; + + const TestUserDefinedIndexFactory* factory_; + const Comparator* comparator_; + std::map, + CustomizedMapComparator> + index_data_; + }; + }; + + protected: + std::vector> generateKVWithValue( + int key_count, const std::string& value) { + std::vector> kvs(key_count); + for (int i = 0; i < key_count; i++) { + std::stringstream ss; + ss << std::setw(2) << std::setfill('0') << i; + std::string key = "key" + ss.str(); + kvs[i] = std::make_pair(key, value); + } + if (is_reverse_comparator_) { + std::reverse(kvs.begin(), kvs.end()); + } + return kvs; + } + + std::vector> generateKVs( + int key_count, int value_size = 0) { + std::vector> kvs(key_count); + for (int i = 0; i < key_count; i++) { + std::stringstream ss; + ss << std::setw(2) << std::setfill('0') << i; + std::string key = "key" + ss.str(); + std::string value; + if (value_size != 0) { + value = rnd.RandomString(1024); + } else { + value = "value" + ss.str(); + } + kvs[i] = std::make_pair(key, value); + } + if (is_reverse_comparator_) { + std::reverse(kvs.begin(), kvs.end()); + } + return kvs; + } + + void BasicTest(bool use_partitioned_index); + + void ValidateMultiScan( + std::vector, int, int>> + scan_opt_validation_arg, + std::unordered_map property_bag, + const ReadOptions& ro, MultiScanArgs& scan_opts, + std::vector& key_counts, std::unique_ptr& db, + ColumnFamilyHandle* cfh) { + key_counts.clear(); + (*scan_opts).clear(); + + if (is_reverse_comparator_) { + for (auto& scan_opt_validation_range : scan_opt_validation_arg) { + // reverse each range + std::reverse(std::get<0>(scan_opt_validation_range).begin(), + std::get<0>(scan_opt_validation_range).end()); + } + // reverse all the ranges + std::reverse(scan_opt_validation_arg.begin(), + scan_opt_validation_arg.end()); + } + + for (auto& scan_opt_validation_range : scan_opt_validation_arg) { + scan_opts.insert(std::get<0>(scan_opt_validation_range)[0], + std::get<0>(scan_opt_validation_range)[1], + std::optional(property_bag)); + if (is_reverse_comparator_) { + key_counts.push_back(std::get<2>(scan_opt_validation_range)); + } else { + key_counts.push_back(std::get<1>(scan_opt_validation_range)); + } + } + + Slice ub; + ReadOptions read_opts = ro; + int key_count = 0; + int index = 0; + auto opts = scan_opts.GetScanRanges(); + read_opts.iterate_upper_bound = &ub; + std::unique_ptr iter(db->NewIterator(read_opts, cfh)); + iter->Prepare(scan_opts); + for (auto opt : opts) { + ub = opt.range.limit.value(); + iter->Seek(opt.range.start.value()); + if (kVerbose) { + printf("range start key %s, end key %s\n", + opt.range.start.value().ToString().c_str(), + opt.range.limit.value().ToString().c_str()); + } + EXPECT_OK(iter->status()); + while (iter->Valid()) { + if (kVerbose) { + printf("found key %s\n", iter->key().ToString().c_str()); + } + key_count++; + iter->Next(); + } + EXPECT_EQ(key_count, key_counts[index]); + key_count = 0; + index++; + } + EXPECT_OK(iter->status()); + } + Options options_; + const Comparator* comparator_; + bool is_reverse_comparator_; + Random rnd{301}; +}; + +class UserDefinedIndexTest + : public UserDefinedIndexTestBase, + public testing::WithParamInterface { + void SetUp() override { + comparator_ = GetParam(); + options_.comparator = comparator_; + is_reverse_comparator_ = comparator_ == ReverseBytewiseComparator(); + } +}; + +void UserDefinedIndexTestBase::BasicTest(bool use_partitioned_index) { + BlockBasedTableOptions table_options; + std::string dbname = test::PerThreadDBPath("user_defined_index_test"); + std::string ingest_file = dbname + "test.sst"; + + // Set up the user-defined index factory + auto user_defined_index_factory = + std::make_shared(); + table_options.user_defined_index_factory = user_defined_index_factory; + if (use_partitioned_index) { + table_options.partition_filters = true; + table_options.decouple_partitioned_filters = true; + table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + } + // Set up custom flush block policy that flushes every 3 keys + table_options.flush_block_policy_factory = + std::make_shared(); + + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options_)); + ASSERT_OK(writer->Open(ingest_file)); + + auto kvs = generateKVs(/*key_count*/ 100); + for (const auto& kv : kvs) { + ASSERT_OK(writer->Put(kv.first, kv.second)); + } + ASSERT_OK(writer->Finish()); + writer.reset(); + + ImmutableOptions ioptions(options_); + MutableCFOptions moptions((ColumnFamilyOptions(options_))); + EnvOptions eoptions(options_); + TableReaderOptions toptions( + ioptions, moptions.prefix_extractor, + /*_compression_manager=*/nullptr, eoptions, ioptions.internal_comparator, + moptions.block_protection_bytes_per_key, + /*skip_filters*/ false, /*immortal*/ false, + /*force_direct_prefetch*/ false, /*level*/ -1, + /*block_cache_tracer*/ nullptr, + /*max_file_size_for_l0_meta_pin*/ 0, /*cur_db_session_id*/ "", + /*cur_file_num*/ 0, + /* unique_id */ {}, /* largest_seqno */ 0, + /* tail_size */ 0, ioptions.persist_user_defined_timestamps); + // Verify that the user-defined index was created + std::string meta_block_name = kUserDefinedIndexPrefix + "test_index"; + BlockHandle block_handle; + uint64_t file_size = 0; + std::unique_ptr file; + std::unique_ptr file_reader; + const auto& fs = options_.env->GetFileSystem(); + ASSERT_OK(fs->GetFileSize(ingest_file, IOOptions(), &file_size, nullptr)); + ASSERT_OK(fs->NewRandomAccessFile(ingest_file, eoptions, &file, nullptr)); + file_reader.reset(new RandomAccessFileReader(std::move(file), ingest_file)); + ASSERT_OK(FindMetaBlockInFile(file_reader.get(), file_size, + kBlockBasedTableMagicNumber, ioptions, + ReadOptions(), meta_block_name, &block_handle)); + file_reader.reset(); + // With our custom flush policy that flushes every 3 keys, + // we expect around 34 data blocks (100/3 rounded up) + // Verify the number of entries in the user-defined index + // Each data block should have an entry in the index + // With our flush policy of 3 keys per block, we expect around 34 entries + int expected_entries = (100 + 2) / 3; // Ceiling of 100/3 + ASSERT_GE(block_handle.size(), + expected_entries); // At least this many entries + + std::unique_ptr reader(new SstFileReader(options_)); + ASSERT_OK(reader->Open(ingest_file)); + + ReadOptions ro; + std::unique_ptr iter(reader->NewIterator(ro)); + ASSERT_NE(iter, nullptr); + + // Test that we can read all the keys + int key_count = 0; + for (iter->SeekToFirst(); iter->Valid() && iter->status().ok(); + iter->Next()) { + key_count++; + } + ASSERT_EQ(key_count, 100); // We added 100 keys + ASSERT_OK(iter->status()); + iter.reset(); + + ro.table_index_factory = user_defined_index_factory.get(); + iter.reset(reader->NewIterator(ro)); + ASSERT_NE(iter, nullptr); + + // Test seek specific key + key_count = 0; + for (iter->Seek("key40"); iter->Valid(); iter->Next()) { + key_count++; + } + ASSERT_EQ(key_count, is_reverse_comparator_ ? 41 : 60); + ASSERT_OK(iter->status()); + + // Test upper bound + Slice ub(is_reverse_comparator_ ? "key25" : "key75"); + ro.iterate_upper_bound = &ub; + iter.reset(reader->NewIterator(ro)); + ASSERT_NE(iter, nullptr); + + // Test seek specific key with upper bound + key_count = 0; + for (iter->Seek("key40"); iter->Valid(); iter->Next()) { + key_count++; + } + ASSERT_EQ(key_count, is_reverse_comparator_ ? 15 : 35); + ASSERT_OK(iter->status()); + + user_defined_index_factory->seek_error_count_ = 1; + iter.reset(reader->NewIterator(ro)); + ASSERT_NE(iter, nullptr); + iter->Seek("key40"); + ASSERT_NOK(iter->status()); + + user_defined_index_factory->seek_error_count_ = 0; + user_defined_index_factory->next_error_count_ = 1; + iter.reset(reader->NewIterator(ro)); + ASSERT_NE(iter, nullptr); + iter->Seek(is_reverse_comparator_ ? "key92" : "key09"); + ASSERT_OK(iter->status()); + iter->Next(); + ASSERT_OK(iter->status()); + iter->Next(); + if (!is_reverse_comparator_) { + ASSERT_OK(iter->status()); + iter->Next(); + } + ASSERT_NOK(iter->status()); + user_defined_index_factory->next_error_count_ = 0; + + ro.iterate_upper_bound = &ub; + iter.reset(reader->NewIterator(ro)); + ASSERT_NE(iter, nullptr); + MultiScanArgs scan_opts(comparator_); + + std::unordered_map property_bag; + property_bag["count"] = std::to_string(25); + std::vector boundaries = {"key10", "key50"}; + if (is_reverse_comparator_) { + std::reverse(boundaries.begin(), boundaries.end()); + } + + scan_opts.insert(boundaries[0], boundaries[1], std::optional(property_bag)); + iter->Prepare(scan_opts); + // Test that UDI is used to help fetch the number of keys + key_count = 0; + ub = boundaries[1]; + for (iter->Seek(scan_opts.GetScanRanges()[0].range.start.value()); + iter->Valid(); iter->Next()) { + key_count++; + } + // The index may undercount by 2 blocks + ASSERT_EQ(key_count, 29); + ASSERT_OK(iter->status()); +} + +TEST_P(UserDefinedIndexTest, BasicTestWithPartitionedIndex) { + BasicTest(/*use_partitioned_index=*/true); +} + +TEST_P(UserDefinedIndexTest, BasicTestWithoutPartitionedIndex) { + BasicTest(/*use_partitioned_index=*/false); +} + +TEST_P(UserDefinedIndexTest, InvalidArgumentTest1) { + BlockBasedTableOptions table_options; + std::string dbname = test::PerThreadDBPath("user_defined_index_test"); + std::string ingest_file = dbname + "test.sst"; + + // Set up the user-defined index factory + auto user_defined_index_factory = + std::make_shared(); + table_options.user_defined_index_factory = user_defined_index_factory; + + // Set up custom flush block policy that flushes every 3 keys + table_options.flush_block_policy_factory = + std::make_shared(); + + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options_.compression_opts.parallel_threads = 10; + + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options_)); + ASSERT_OK(writer->Open(ingest_file)); + + std::string key = "foo"; + std::string value = "bar"; + ASSERT_EQ(writer->Put(key, value), Status::InvalidArgument()); + ASSERT_EQ(writer->Finish(), Status::InvalidArgument()); + writer.reset(); +} + +TEST_P(UserDefinedIndexTest, InvalidArgumentTest2) { + BlockBasedTableOptions table_options; + std::string dbname = test::PerThreadDBPath("user_defined_index_test"); + std::string ingest_file = dbname + "test.sst"; + + // Set up the user-defined index factory + auto user_defined_index_factory = + std::make_shared(); + table_options.user_defined_index_factory = user_defined_index_factory; + + // Set up custom flush block policy that flushes every 3 keys + table_options.flush_block_policy_factory = + std::make_shared(); + + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options_)); + ASSERT_OK(writer->Open(ingest_file)); + + std::string key = "foo"; + std::string value = "bar"; + ASSERT_OK(writer->Merge(key, value)); + ASSERT_EQ(writer->Finish(), Status::InvalidArgument()); + writer.reset(); +} + +TEST_P(UserDefinedIndexTest, IngestTest) { + BlockBasedTableOptions table_options; + std::string dbname = test::PerThreadDBPath("user_defined_index_test"); + std::string ingest_file = dbname + "test.sst"; + + // Set up the user-defined index factory + auto user_defined_index_factory = + std::make_shared(); + table_options.user_defined_index_factory = user_defined_index_factory; + + // Set up custom flush block policy that flushes every 3 keys + table_options.flush_block_policy_factory = + std::make_shared(); + + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options_)); + ASSERT_OK(writer->Open(ingest_file)); + + auto kvs = generateKVs(/*key_count*/ 100); + for (const auto& kv : kvs) { + ASSERT_OK(writer->Put(kv.first, kv.second)); + } + + ASSERT_OK(writer->Finish()); + writer.reset(); + + std::unique_ptr db; + options_.create_if_missing = true; + Status s = DB::Open(options_, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh)); + + IngestExternalFileOptions ifo; + s = db->IngestExternalFile(cfh, {ingest_file}, ifo); + ASSERT_OK(s); + + ReadOptions ro; + std::unique_ptr iter(db->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + ASSERT_OK(iter->status()); + + // Test that we can read all the keys + int key_count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + key_count++; + } + ASSERT_EQ(key_count, 100); // We added 100 keys + ASSERT_OK(iter->status()); + iter.reset(); + + ro.table_index_factory = user_defined_index_factory.get(); + iter.reset(db->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + + // Test seek specific key + key_count = 0; + for (iter->Seek("key40"); iter->Valid(); iter->Next()) { + key_count++; + } + ASSERT_EQ(key_count, is_reverse_comparator_ ? 41 : 60); + ASSERT_OK(iter->status()); + + // Test upper bound + Slice ub(is_reverse_comparator_ ? "key25" : "key75"); + ro.iterate_upper_bound = &ub; + iter.reset(db->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + + // Test seek specific key with upper bound + key_count = 0; + for (iter->Seek("key40"); iter->Valid(); iter->Next()) { + key_count++; + } + ASSERT_EQ(key_count, is_reverse_comparator_ ? 15 : 35); + ASSERT_OK(iter->status()); + iter.reset(); + + ASSERT_OK(db->DestroyColumnFamilyHandle(cfh)); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, options_)); +} + +TEST_P(UserDefinedIndexTest, EmptyRangeTest) { + BlockBasedTableOptions table_options; + std::string dbname = test::PerThreadDBPath("user_defined_index_test"); + std::string ingest_file = dbname + "test.sst"; + + // Set up the user-defined index factory + auto user_defined_index_factory = + std::make_shared(); + table_options.user_defined_index_factory = user_defined_index_factory; + + // Set up custom flush block policy that flushes every 3 keys + table_options.flush_block_policy_factory = + std::make_shared(); + + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options_)); + ASSERT_OK(writer->Open(ingest_file)); + + // Generate key range key0 ~ key19, key40 ~ key59, key80 ~ key99 + std::vector> kvs; + bool skip = false; + for (int i = 0; i < 100; i++) { + if (i > 0 && i % 20 == 0) { + skip = !skip; + } + if (skip) { + continue; + } + std::stringstream ss; + ss << std::setw(2) << std::setfill('0') << i; + std::string key = "key" + ss.str(); + std::string value = "value" + ss.str(); + kvs.emplace_back(key, value); + } + + if (is_reverse_comparator_) { + std::reverse(kvs.begin(), kvs.end()); + } + + for (const auto& kv : kvs) { + ASSERT_OK(writer->Put(kv.first, kv.second)); + } + ASSERT_OK(writer->Finish()); + writer.reset(); + + std::unique_ptr db; + options_.create_if_missing = true; + Status s = DB::Open(options_, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh)); + + IngestExternalFileOptions ifo; + s = db->IngestExternalFile(cfh, {ingest_file}, ifo); + ASSERT_OK(s); + + ReadOptions ro; + std::unique_ptr iter(db->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + ASSERT_OK(iter->status()); + + // Test that we can read all the keys + int key_count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + key_count++; + } + ASSERT_EQ(key_count, 60); + ASSERT_OK(iter->status()); + iter.reset(); + + ro.table_index_factory = user_defined_index_factory.get(); + std::vector key_counts; + MultiScanArgs scan_opts(options_.comparator); + std::unordered_map property_bag; + property_bag["count"] = std::to_string(5); + + ValidateMultiScan({{{"key25", "key30"}, 0, 0}, + {{"key33", "key37"}, 0, 0}, + // Non-empty scan with range greater than count + // In the key42:key56 range, we might read an additional + // block worth of keys due to the boundaries (5 + 3) + {{"key42", "key56"}, 8, 7}, + // Empty scan succeeding a non-empty one + {{"key65", "key70"}, 0, 0}, + // A non-empty scan with range smaller than count + {{"key85", "key87"}, 2, 2}, + // Scan range completely outside the DB + {{"key991", "key999"}, 0, 0}}, + property_bag, ro, scan_opts, key_counts, db, cfh); + + // Scans that overlap with part of key range, with overlap less than count + ValidateMultiScan({{{"key18", "key25"}, 2, 1}, {{"key38", "key43"}, 3, 4}}, + property_bag, ro, scan_opts, key_counts, db, cfh); + + // Scans that overlap with part of key range, with overlap same as count + ValidateMultiScan({{{"key15", "key26"}, 5, 4}, {{"key38", "key46"}, 6, 7}}, + property_bag, ro, scan_opts, key_counts, db, cfh); + + // Scans that overlap with part of key range, with overlap greater than count + ValidateMultiScan({{{"key10", "key26"}, 8, 8}, + // Cross block boundary + {{"key38", "key49"}, 7, 9}}, + property_bag, ro, scan_opts, key_counts, db, cfh); + + // Scan bigger than one contiguous range of keys, with overlap greater than + // count + ValidateMultiScan({{{"key75", "key991"}, 8, 9}}, property_bag, ro, scan_opts, + key_counts, db, cfh); + + // Scan bigger than one contiguous range of keys, with overlap less than count + property_bag["count"] = std::to_string(25); + ValidateMultiScan({{{"key75", "key991"}, 20, 20}}, property_bag, ro, + scan_opts, key_counts, db, cfh); + + ASSERT_OK(db->DestroyColumnFamilyHandle(cfh)); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, options_)); +} + +// Verify that external file ingestion fails if we try to ingest an SST file +// without the UDI and a UDI factory is configured in BlockBasedTableOptions +// and fail_if_no_udi_on_open is true in BlockBasedTableOptions. +TEST_P(UserDefinedIndexTest, IngestFailTest) { + BlockBasedTableOptions table_options; + std::string dbname = test::PerThreadDBPath("user_defined_index_test"); + std::string ingest_file = dbname + "test.sst"; + + // Set up custom flush block policy that flushes every 3 keys + table_options.flush_block_policy_factory = + std::make_shared(); + + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options_)); + ASSERT_OK(writer->Open(ingest_file)); + + auto kvs = generateKVs(/*key_count*/ 100); + for (const auto& kv : kvs) { + ASSERT_OK(writer->Put(kv.first, kv.second)); + } + ASSERT_OK(writer->Finish()); + writer.reset(); + + // Set up the user-defined index factory + auto user_defined_index_factory = + std::make_shared(); + table_options.user_defined_index_factory = user_defined_index_factory; + table_options.fail_if_no_udi_on_open = true; + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + std::unique_ptr db; + options_.create_if_missing = true; + Status s = DB::Open(options_, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh)); + + IngestExternalFileOptions ifo; + s = db->IngestExternalFile(cfh, {ingest_file}, ifo); + ASSERT_NOK(s); + + ASSERT_OK(db->SetOptions( + cfh, {{"block_based_table_factory", "{fail_if_no_udi_on_open=false;}"}})); + s = db->IngestExternalFile(cfh, {ingest_file}, ifo); + ASSERT_OK(s); + + ASSERT_OK(db->DestroyColumnFamilyHandle(cfh)); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, options_)); +} + +TEST_P(UserDefinedIndexTest, IngestEmptyUDI) { + BlockBasedTableOptions table_options; + std::string dbname = test::PerThreadDBPath("user_defined_index_test"); + std::string ingest_file = dbname + "test.sst"; + std::string ingest_file2 = dbname + "dummy.sst"; + + // Set up the user-defined index factory + auto user_defined_index_factory = + std::make_shared(); + table_options.user_defined_index_factory = user_defined_index_factory; + // Set up custom flush block policy that flushes every 3 keys + table_options.flush_block_policy_factory = + std::make_shared(); + + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options_)); + ASSERT_OK(writer->Open(ingest_file)); + + auto kvs = generateKVs(/*key_count*/ 100); + for (const auto& kv : kvs) { + ASSERT_OK(writer->Put(kv.first, kv.second)); + } + ASSERT_OK(writer->Finish()); + writer.reset(); + writer.reset(new SstFileWriter(EnvOptions(), options_)); + ASSERT_OK(writer->Open(ingest_file2)); + ASSERT_OK(writer->Put("dummy", "val")); + ASSERT_OK(writer->Finish()); + writer.reset(); + + table_options.fail_if_no_udi_on_open = true; + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + std::unique_ptr db; + options_.create_if_missing = true; + Status s = DB::Open(options_, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh)); + + std::vector ifa; + ifa.emplace_back(); + ifa[0].column_family = cfh; + ifa[0].external_files.emplace_back(ingest_file); + ifa[0].external_files.emplace_back(ingest_file2); + s = db->IngestExternalFiles(ifa); + ASSERT_OK(s); + + ASSERT_OK(db->DestroyColumnFamilyHandle(cfh)); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, options_)); +} + +TEST_P(UserDefinedIndexTest, MultiScanFailureTest) { + BlockBasedTableOptions table_options; + std::string dbname = test::PerThreadDBPath("user_defined_index_test"); + std::string ingest_file = dbname + "test.sst"; + + // Set up the user-defined index factory + auto user_defined_index_factory = + std::make_shared(); + table_options.user_defined_index_factory = user_defined_index_factory; + + // Set up custom flush block policy that flushes every 3 keys + table_options.flush_block_policy_factory = + std::make_shared(); + + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options_)); + ASSERT_OK(writer->Open(ingest_file)); + + // Use bigger value, so that prefetch size limit will be effective + auto kvs = generateKVs(/*key_count*/ 100, /* value_size */ 1024); + for (const auto& kv : kvs) { + ASSERT_OK(writer->Put(kv.first, kv.second)); + } + ASSERT_OK(writer->Finish()); + writer.reset(); + + std::unique_ptr db; + options_.create_if_missing = true; + Status s = DB::Open(options_, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh)); + + IngestExternalFileOptions ifo; + s = db->IngestExternalFile(cfh, {ingest_file}, ifo); + ASSERT_OK(s); + + std::vector key_ranges({"key03", "key05", "key12", "key14"}); + ReadOptions ro; + ro.table_index_factory = user_defined_index_factory.get(); + Slice ub; + ro.iterate_upper_bound = &ub; + std::unordered_map property_bag; + property_bag["count"] = std::to_string(5); + MultiScanArgs scan_options(comparator_); + if (is_reverse_comparator_) { + std::reverse(key_ranges.begin(), key_ranges.end()); + } + scan_options.insert(key_ranges[0], key_ranges[1], property_bag); + scan_options.insert(key_ranges[2], key_ranges[3], property_bag); + scan_options.max_prefetch_size = 3500; + std::unique_ptr iter(db->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + iter->Prepare(scan_options); + int count = 0; + ub = key_ranges[1]; + iter->Seek(key_ranges[0]); + while (iter->status().ok() && iter->Valid()) { + ASSERT_GE(comparator_->Compare(iter->key(), key_ranges[0]), 0); + ASSERT_LT(comparator_->Compare(iter->key(), key_ranges[1]), 0); + count++; + iter->Next(); + } + ASSERT_OK(iter->status()) << iter->status().ToString(); + ASSERT_EQ(count, 2); + + ub = key_ranges[3]; + iter->Seek(key_ranges[2]); + // This should fail due to reaching max_prefetch_size limit + ASSERT_EQ(iter->status(), Status::Incomplete()); + iter.reset(); + + // Empty range multiscan error + iter.reset(db->NewIterator(ro, cfh)); + scan_options = MultiScanArgs(comparator_); + iter->Prepare(scan_options); + ASSERT_EQ(iter->status(), Status::InvalidArgument("Empty MultiScanArgs")); + + // Check no seek key error + iter.reset(db->NewIterator(ro, cfh)); + scan_options = MultiScanArgs(comparator_); + scan_options.insert(key_ranges[0], key_ranges[2], property_bag); + iter->Prepare(scan_options); + iter->SeekToFirst(); + ASSERT_EQ(iter->status(), + Status::InvalidArgument("No seek key for MultiScan")); + + // Seek is not allowed to seen a key that is not following the prepare order + iter.reset(db->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + scan_options.max_prefetch_size = 0; + iter->Prepare(scan_options); + ub = key_ranges[3]; + iter->Seek(key_ranges[2]); + ASSERT_EQ( + iter->status(), + Status::InvalidArgument( + "Seek target does not match the start of the next prepared range at " + "index 0")); + ASSERT_FALSE(iter->Valid()); + iter.reset(); + + // limit is equal to start error + iter.reset(db->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + (*scan_options).clear(); + scan_options.insert(key_ranges[0], key_ranges[0], property_bag); + iter->Prepare(scan_options); + ASSERT_EQ(iter->status(), + Status::InvalidArgument( + "Scan start key is large or equal than limit at index 0")); + iter.reset(); + + // overlapping ranges error + iter.reset(db->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + (*scan_options).clear(); + scan_options.insert(key_ranges[0], key_ranges[2], property_bag); + scan_options.insert(key_ranges[1], key_ranges[3], property_bag); + iter->Prepare(scan_options); + ASSERT_EQ(iter->status(), + Status::InvalidArgument("Overlapping ranges at index 1")); + iter.reset(); + + // Validate an error is returned if upper bound is not set to the same value + // as limit + iter.reset(db->NewIterator(ro, cfh)); + scan_options = MultiScanArgs(comparator_); + scan_options.insert(key_ranges[0], key_ranges[1], property_bag); + iter->Prepare(scan_options); + ub = ""; + iter->Seek(key_ranges[0]); + ASSERT_EQ(iter->status(), + Status::InvalidArgument( + "Upper bound is not set to the same limit value of the next " + "prepared range at index 0")); + ASSERT_FALSE(iter->Valid()); + + // Validate an error is returned when seek more keys than prepared + iter.reset(db->NewIterator(ro, cfh)); + scan_options = MultiScanArgs(comparator_); + scan_options.insert(key_ranges[0], key_ranges[1], property_bag); + iter->Prepare(scan_options); + ub = key_ranges[1]; + iter->Seek(key_ranges[0]); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + iter->Seek(key_ranges[2]); + ASSERT_EQ(iter->status(), + Status::InvalidArgument( + "Seek called after exhausting all of the scan ranges")); + ASSERT_FALSE(iter->Valid()); + iter.reset(); + + // Check error is returned if upper bound is not set and limit is set + ro.iterate_upper_bound = nullptr; + iter.reset(db->NewIterator(ro, cfh)); + scan_options = MultiScanArgs(comparator_); + scan_options.insert(key_ranges[0], key_ranges[1], property_bag); + iter->Prepare(scan_options); + iter->Seek(key_ranges[0]); + ASSERT_EQ(iter->status(), + Status::InvalidArgument( + "Upper bound is not set to the same limit value of the next " + "prepared range at index 0")); + ASSERT_FALSE(iter->Valid()); + iter.reset(); + + // Upper bound is allowed to be empty, if limit is not set + ro.iterate_upper_bound = nullptr; + iter.reset(db->NewIterator(ro, cfh)); + scan_options = MultiScanArgs(comparator_); + scan_options.insert(key_ranges[0], property_bag); + iter->Prepare(scan_options); + iter->Seek(key_ranges[0]); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + iter.reset(); + + ASSERT_OK(db->DestroyColumnFamilyHandle(cfh)); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, options_)); +} + +TEST_P(UserDefinedIndexTest, ConfigTest) { + BlockBasedTableOptions table_options; + std::string dbname = test::PerThreadDBPath("user_defined_index_test"); + std::string ingest_file = dbname + "test.sst"; + + // Set up the user-defined index factory + auto user_defined_index_factory = + std::make_shared(); + table_options.user_defined_index_factory = user_defined_index_factory; + + // Set up custom flush block policy that flushes every 3 keys + table_options.flush_block_policy_factory = + std::make_shared(); + + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options_)); + ASSERT_OK(writer->Open(ingest_file)); + + auto kvs = generateKVs(/*key_count*/ 100); + for (const auto& kv : kvs) { + ASSERT_OK(writer->Put(kv.first, kv.second)); + } + ASSERT_OK(writer->Finish()); + writer.reset(); + + table_options.user_defined_index_factory.reset(); + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + // Set up the user-defined index factory + ObjectLibrary::Default().get()->AddFactory( + "test_index", [](const std::string& /* uri */, + std::unique_ptr* guard, + std::string* /* errmsg */) { + auto factory = new TestUserDefinedIndexFactory(); + guard->reset(factory); + return guard->get(); + }); + ASSERT_OK(GetColumnFamilyOptionsFromString( + ConfigOptions(), options_, + "block_based_table_factory={user_defined_index_factory=test_index;}", + &options_)); + + std::unique_ptr db; + options_.create_if_missing = true; + Status s = DB::Open(options_, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh)); + + IngestExternalFileOptions ifo; + s = db->IngestExternalFile(cfh, {ingest_file}, ifo); + ASSERT_OK(s); + + ReadOptions ro; + Slice ub; + ro.iterate_upper_bound = &ub; + ro.table_index_factory = user_defined_index_factory.get(); + std::unique_ptr iter(db->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + MultiScanArgs scan_opts(options_.comparator); + std::unordered_map property_bag; + property_bag["count"] = std::to_string(25); + + std::vector boundaries = {"key10", "key50"}; + if (is_reverse_comparator_) { + std::reverse(boundaries.begin(), boundaries.end()); + } + + scan_opts.insert(boundaries[0], boundaries[1], std::optional(property_bag)); + iter->Prepare(scan_opts); + // Test that UDI is used to help fetch the number of keys + ub = boundaries[1]; + int key_count = 0; + for (iter->Seek(scan_opts.GetScanRanges()[0].range.start.value()); + iter->Valid(); iter->Next()) { + key_count++; + } + // Number of blocks prepared is based on UDI, it would be slightly higher than + // the limit + // The index may undercount by 2 blocks + ASSERT_EQ(key_count, 29); + ASSERT_OK(iter->status()); + iter.reset(); + + ASSERT_OK(db->DestroyColumnFamilyHandle(cfh)); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, options_)); +} + +TEST_P(UserDefinedIndexTest, RangeDelete) { + BlockBasedTableOptions table_options; + options_.num_levels = 50; + options_.compaction_style = kCompactionStyleUniversal; + options_.disable_auto_compactions = true; + std::string dbname = test::PerThreadDBPath("user_defined_index_test"); + std::string ingest_file = dbname + "test.sst"; + + // Set up the user-defined index factory + auto user_defined_index_factory = + std::make_shared(); + table_options.user_defined_index_factory = user_defined_index_factory; + + // Set up custom flush block policy that flushes every 3 keys + table_options.flush_block_policy_factory = + std::make_shared(); + + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + auto create_ingestion_data_file = [&](const std::string& filename) { + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options_)); + ASSERT_OK(writer->Open(filename)); + auto kvs = generateKVs(100); + + for (const auto& kv : kvs) { + ASSERT_OK(writer->Put(kv.first, kv.second)); + } + ASSERT_OK(writer->Finish()); + writer.reset(); + }; + + // Create first ingestion file with data + create_ingestion_data_file(ingest_file + "_0"); + + // Create second ingestion file with range delete only that covers the first + // file to delete all of its keys. + { + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options_)); + ASSERT_OK(writer->Open(ingest_file + "_1")); + if (is_reverse_comparator_) { + ASSERT_OK(writer->DeleteRange("keyz", "key")); + } else { + ASSERT_OK(writer->DeleteRange("key", "keyz")); + } + ASSERT_OK(writer->Finish()); + writer.reset(); + } + + // Create the second ingestion file with data + create_ingestion_data_file(ingest_file + "_2"); + + std::unique_ptr db; + options_.create_if_missing = true; + Status s = DB::Open(options_, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh)); + + IngestExternalFileOptions ifo; + // ingest first data file key00~key99 + s = db->IngestExternalFile(cfh, {ingest_file + "_0"}, ifo); + ASSERT_OK(s); + // ingest delete range (key-keyz) and new data file (key00-key99) together + s = db->IngestExternalFile(cfh, {ingest_file + "_1", ingest_file + "_2"}, + ifo); + ASSERT_OK(s); + + std::vector range = { + Slice("key10"), + Slice("key25"), + Slice("key80"), + Slice("key95"), + }; + + if (is_reverse_comparator_) { + std::reverse(range.begin(), range.end()); + } + + Slice ub(""); + ReadOptions ro; + ro.iterate_upper_bound = &ub; + std::unique_ptr iter(db->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + + MultiScanArgs scan_opts(options_.comparator); + std::unordered_map property_bag; + property_bag["count"] = std::to_string(9); + + std::vector> decoded_ranges; + for (size_t i = 0; i < range.size() / 2; i++) { + scan_opts.insert(range[i * 2], range[i * 2 + 1], + std::optional(property_bag)); + } + iter->Prepare(scan_opts); + + for (size_t i = 0; i < range.size() / 2; i++) { + // Update upper bound before each seek + ub = range[2 * i + 1]; + auto key_count = 0; + for (iter->Seek(range[i * 2]); iter->Valid(); iter->Next()) { + key_count++; + } + ASSERT_OK(iter->status()); + ASSERT_EQ(key_count, 15); + } + + iter.reset(); + + ASSERT_OK(db->DestroyColumnFamilyHandle(cfh)); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, options_)); +} + +TEST_P(UserDefinedIndexTest, QueryCrossTwoFiles) { + BlockBasedTableOptions table_options; + options_.num_levels = 50; + options_.compaction_style = kCompactionStyleUniversal; + options_.disable_auto_compactions = true; + options_.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(4); + std::string dbname = test::PerThreadDBPath("user_defined_index_test"); + std::string ingest_file = dbname + "test.sst"; + + // Set up the user-defined index factory + auto user_defined_index_factory = + std::make_shared(); + table_options.user_defined_index_factory = user_defined_index_factory; + + // Set up custom flush block policy that flushes every 3 keys + table_options.flush_block_policy_factory = + std::make_shared(); + + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + auto create_ingestion_data_file = [&](const std::string& filename, + const std::string& value) { + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options_)); + ASSERT_OK(writer->Open(filename)); + auto kvs = generateKVWithValue(100, value); + + for (const auto& kv : kvs) { + ASSERT_OK(writer->Put(kv.first, kv.second)); + } + ASSERT_OK(writer->Finish()); + writer.reset(); + }; + + // Create first ingestion file with data + create_ingestion_data_file(ingest_file + "_0", "old"); + + std::unique_ptr db; + options_.create_if_missing = true; + Status s = DB::Open(options_, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh)); + + IngestExternalFileOptions ifo; + // ingest data file key00~key99 + s = db->IngestExternalFile(cfh, {ingest_file + "_0"}, ifo); + ASSERT_OK(s); + + // Compact the file with SST partitioner, so that files are split into + // multiple ones + s = db->CompactRange( + {.exclusive_manual_compaction = true, + .bottommost_level_compaction = BottommostLevelCompaction::kForce}, + cfh, nullptr, nullptr); + ASSERT_OK(s); + + std::vector range = { + // Each range span across 2 files + Slice("key16"), + Slice("key24"), + Slice("key26"), + Slice("key34"), + }; + + if (is_reverse_comparator_) { + std::reverse(range.begin(), range.end()); + } + + Slice ub(""); + ReadOptions ro; + ro.iterate_upper_bound = &ub; + std::unique_ptr iter(db->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + + MultiScanArgs scan_opts(options_.comparator); + std::unordered_map property_bag; + auto read_key_per_range_limit = 2; + property_bag["count"] = std::to_string(read_key_per_range_limit); + + for (size_t i = 0; i < range.size() / 2; i++) { + scan_opts.insert(range[i * 2], range[i * 2 + 1], + std::optional(property_bag)); + } + iter->Prepare(scan_opts); + + for (size_t i = 0; i < range.size() / 2; i++) { + // Update upper bound before each seek + ub = range[2 * i + 1]; + auto key_count = 0; + for (iter->Seek(range[i * 2]); iter->Valid(); iter->Next()) { + key_count++; + ASSERT_EQ(iter->value(), "old"); + if (key_count >= read_key_per_range_limit) { + break; + } + } + ASSERT_OK(iter->status()); + ASSERT_EQ(key_count, read_key_per_range_limit); + } + + // Create another ingestion file with range delete only that covers the first + // file to delete all of its keys. + { + std::unique_ptr writer; + writer.reset(new SstFileWriter(EnvOptions(), options_)); + ASSERT_OK(writer->Open(ingest_file + "_1")); + if (is_reverse_comparator_) { + ASSERT_OK(writer->DeleteRange("keyz", "key")); + } else { + ASSERT_OK(writer->DeleteRange("key", "keyz")); + } + ASSERT_OK(writer->Finish()); + writer.reset(); + } + s = db->IngestExternalFile(cfh, {ingest_file + "_1"}, ifo); + ASSERT_OK(s); + + // ingest new data + create_ingestion_data_file(ingest_file + "_2", "new"); + s = db->IngestExternalFile(cfh, {ingest_file + "_2"}, ifo); + ASSERT_OK(s); + + iter.reset(db->NewIterator(ro, cfh)); + ASSERT_NE(iter, nullptr); + ASSERT_OK(iter->status()); + + iter->Prepare(scan_opts); + + for (size_t i = 0; i < range.size() / 2; i++) { + // Update upper bound before each seek + ub = range[2 * i + 1]; + auto key_count = 0; + for (iter->Seek(range[i * 2]); iter->Valid(); iter->Next()) { + key_count++; + ASSERT_EQ(iter->value(), "new"); + if (key_count >= read_key_per_range_limit) { + break; + } + } + ASSERT_OK(iter->status()); + ASSERT_EQ(key_count, read_key_per_range_limit); + } + + iter.reset(); + + ASSERT_OK(db->DestroyColumnFamilyHandle(cfh)); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, options_)); +} + +INSTANTIATE_TEST_CASE_P(UserDefinedIndexTest, UserDefinedIndexTest, + ::testing::Values(BytewiseComparator(), + ReverseBytewiseComparator())); + +struct UserDefinedIndexStressTestParam { + const Comparator* comparator; + bool enable_udi; + bool enable_compaction_with_sst_partitioner; + + using UserDefinedIndexStressTestTuple = + std::tuple; + + UserDefinedIndexStressTestParam(const UserDefinedIndexStressTestTuple& tuple) + : comparator(std::get<0>(tuple)), + enable_udi(std::get<1>(tuple)), + enable_compaction_with_sst_partitioner(std::get<2>(tuple)) {} +}; + +std::ostream& operator<<(std::ostream& os, + const UserDefinedIndexStressTestParam& param) { + return os << "UserDefinedIndexStressTestParam{comparator=" + << (param.comparator ? param.comparator->Name() : "nullptr") + << ", enable_udi=" << param.enable_udi + << ", enable_compaction_with_sst_partitioner=" + << param.enable_compaction_with_sst_partitioner << "}"; +} + +struct DataRange { + size_t start; // inclusive + size_t end; // exclusive + std::string value; + bool is_range_delete; + bool skipped; + size_t scan_key_count_limit; + std::string start_key; + std::string end_key; + + // print the range in human readable format + std::string ToString() const { + std::ostringstream oss; + oss << "[" << start << ", " << end << "), value: " << value + << ", is_range_delete: " << is_range_delete << ", skipped: " << skipped + << ", scan_key_count_limit: " << scan_key_count_limit + << ", start_key: " << start_key << ", end_key: " << end_key; + return oss.str(); + } +}; +class UserDefinedIndexStressTest + : public UserDefinedIndexTestBase, + public testing::WithParamInterface< + UserDefinedIndexStressTestParam::UserDefinedIndexStressTestTuple> { + public: + void SetUp() override { + rand_seed_ = static_cast( + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count()); + + std::cout << "Random seed: " << rand_seed_ << std::endl; + + rnd = Random(rand_seed_); + UserDefinedIndexStressTestParam param = GetParam(); + comparator_ = param.comparator; + enable_udi_ = param.enable_udi; + enable_compaction_with_sst_partitioner_ = + param.enable_compaction_with_sst_partitioner; + options_.comparator = comparator_; + is_reverse_comparator_ = comparator_ == ReverseBytewiseComparator(); + options_.compaction_style = kCompactionStyleUniversal; + + // Set up custom flush block policy that flushes every 3 keys + table_options_.flush_block_policy_factory = + std::make_shared(); + + options_.table_factory.reset(NewBlockBasedTableFactory(table_options_)); + } + + void TearDown() override { + ASSERT_OK(db_->DestroyColumnFamilyHandle(ingest_cfh_)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(regular_cfh_)); + + ASSERT_OK(db_->Close()); + ASSERT_OK(DestroyDB(dbname_, options_)); + } + + protected: + static constexpr auto kKeyRange = 100; + bool enable_udi_{}; + bool enable_compaction_with_sst_partitioner_{}; + uint32_t rand_seed_{}; + std::shared_ptr user_defined_index_factory_; + BlockBasedTableOptions table_options_; + const Comparator* comparator_{}; + bool is_reverse_comparator_{}; + Random rnd{0}; + ColumnFamilyHandle* ingest_cfh_ = nullptr; + ColumnFamilyHandle* regular_cfh_ = nullptr; + std::unique_ptr db_; + std::vector> ranges_in_levels_; + std::string dbname_; + + void SetupDB(const std::string& dbname) { + options_.create_if_missing = true; + options_.disable_auto_compactions = true; + Status s = DB::Open(options_, dbname, &db_); + ASSERT_OK(s); + ASSERT_TRUE(db_ != nullptr); + if (enable_compaction_with_sst_partitioner_) { + // Use a SST partitioner to create multiple files, use the first 4 bytes + // of key to partition the file, The key is formatted with 2 digit + // following "key" string, e.g. key01, key99 + options_.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(4); + } + + ASSERT_OK(db_->CreateColumnFamily(options_, "regular_cf", ®ular_cfh_)); + + if (enable_udi_) { + // Set up the user-defined index factory + user_defined_index_factory_ = + std::make_shared(); + table_options_.user_defined_index_factory = user_defined_index_factory_; + } + + options_.table_factory.reset(NewBlockBasedTableFactory(table_options_)); + ASSERT_OK(db_->CreateColumnFamily(options_, "ingest_cf", &ingest_cfh_)); + } + + template + std::string FormatKey(T i) { + std::stringstream ss; + ss << std::setw(2) << std::setfill('0') << i; + return "key" + ss.str(); + } + + std::vector GenerateKeyRanges(size_t range_count, + int skip_range_count, + const std::string& value) { + std::set boundaries; + // generate n + 1 number of unique boundaries to form n contiguoes ranges + while (boundaries.size() < range_count + 1) { + boundaries.insert(rnd.Uniform(kKeyRange)); + } + std::vector sorted_boundaries(boundaries.begin(), boundaries.end()); + if (is_reverse_comparator_) { + std::reverse(sorted_boundaries.begin(), sorted_boundaries.end()); + } + auto ranges = std::vector(); + std::optional prev_bound; + for (auto it = sorted_boundaries.begin(); it != sorted_boundaries.end(); + it++) { + if (prev_bound.has_value()) { + ranges.push_back({.start = prev_bound.value(), + .end = *it, + .value = value, + .is_range_delete = rnd.OneIn(6), + .skipped = false, + .scan_key_count_limit = rnd.Uniform(10) + 1, + .start_key = FormatKey(prev_bound.value()), + .end_key = FormatKey(*it)}); + } + prev_bound = *it; + } + // skipped some of them + for (int j = 0; j < skip_range_count; j++) { + ranges[rnd.Uniform(static_cast(range_count))].skipped = true; + } + + if (kVerbose) { + for (auto const& range : ranges) { + std::cout << range.ToString() << std::endl; + } + } + + return ranges; + } + + void CreateSstFileWithRanges(const std::string& ingest_file, + const std::vector& ranges, + bool& data_added) { + std::unique_ptr writer; + + data_added = false; + + std::vector ranges_in_file; + + for (auto const& range : ranges) { + assert(range.start != range.end); + if (range.skipped) { + continue; + } + + if (writer == nullptr) { + // lazy create writer until there is data to be written to avoid + // unchecked status error + writer = std::make_unique(EnvOptions(), options_); + ASSERT_OK(writer->Open(ingest_file)); + } + + ranges_in_file.push_back(range); + + data_added = true; + + if (range.is_range_delete) { + ASSERT_OK(writer->DeleteRange(range.start_key, range.end_key)); + } else { + for (size_t i = range.start; i != range.end;) { + auto key = FormatKey(i); + range.start < range.end ? i++ : i--; + ASSERT_OK(writer->Put(key, range.value)); + } + } + } + if (kVerbose) { + std::cout << "Ingested file: " + ingest_file + "; Range: {" << std::endl; + for (const auto& range : ranges_in_file) { + std::cout << " " << range.ToString() << "," << std::endl; + } + std::cout << "}" << std::endl; + } + if (data_added) { + ASSERT_OK(writer->Finish()); + } + } + + void RangeScan(std::unique_ptr& iter, + const std::vector& ranges, Slice& upper_bound, + std::vector>& result, + bool use_multi_scan) { + ASSERT_NE(iter, nullptr); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!ranges.empty()); + + MultiScanArgs scan_opts(options_.comparator); + std::unordered_map property_bag; + if (use_multi_scan) { + for (auto const& range : ranges) { + if (range.skipped) { + continue; + } + property_bag["count"] = std::to_string(range.scan_key_count_limit); + scan_opts.insert(range.start_key, range.end_key, property_bag); + // print range start end key + if (kVerbose) { + std::cout << "range start " << range.start_key << " end " + << range.end_key << std::endl; + } + } + iter->Prepare(scan_opts); + ASSERT_OK(iter->status()); + } + + for (auto const& range : ranges) { + if (range.skipped) { + continue; + } + size_t scan_key_count = 0; + if (kVerbose) { + std::cout << "seek key " << range.start_key << std::endl; + } + upper_bound = range.end_key; + for (iter->Seek(range.start_key); + iter->Valid() && scan_key_count < range.scan_key_count_limit; + iter->Next()) { + if (kVerbose) { + std::cout << "key " << iter->key().ToString() << " value " + << iter->value().ToString() << std::endl; + } + result.emplace_back(iter->key().ToString(), iter->value().ToString()); + scan_key_count++; + } + ASSERT_OK(iter->status()); + } + } + + void AddDataToRegularCF() { + for (auto const& ranges_in_level : ranges_in_levels_) { + for (auto const& range : ranges_in_level) { + if (!range.skipped) { + for (auto i = range.start; i != range.end; + range.start < range.end ? i++ : i--) { + if (range.is_range_delete) { + ASSERT_OK( + db_->Delete(WriteOptions(), regular_cfh_, FormatKey(i))); + } else { + ASSERT_OK(db_->Put(WriteOptions(), regular_cfh_, FormatKey(i), + range.value)); + } + } + } + } + } + ASSERT_OK(db_->Flush(FlushOptions(), regular_cfh_)); + } + + void ValidateQueryResult() { + // Query both CF with same range scan and validate result are same + for (auto i = 0; i < 200; i++) { + if (kVerbose) { + std::cout << "iteration " << i << std::endl; + } + SCOPED_TRACE("Iteration " + std::to_string(i)); + // randomly generate 1 to 3 ranges + auto ranges = GenerateKeyRanges(rnd.Uniform(3) + 4, 2, ""); + + // Query regular CF + std::vector> expected_result; + Slice upper_bound(""); + ReadOptions ro; + ro.iterate_upper_bound = &upper_bound; + + std::unique_ptr iter(db_->NewIterator(ro, regular_cfh_)); + ASSERT_NO_FATAL_FAILURE( + RangeScan(iter, ranges, upper_bound, expected_result, false)); + ASSERT_OK(iter->status()); + + // Query ingest CF + iter.reset(db_->NewIterator(ro, ingest_cfh_)); + std::vector> ingest_cf_result; + ASSERT_NO_FATAL_FAILURE( + RangeScan(iter, ranges, upper_bound, ingest_cf_result, false)); + + ASSERT_EQ(expected_result, ingest_cf_result); + ASSERT_OK(iter->status()); + + // Query ingest CF with UDI if it is enabled + if (enable_udi_) { + ro.table_index_factory = user_defined_index_factory_.get(); + } + + iter.reset(db_->NewIterator(ro, ingest_cfh_)); + std::vector> + ingest_cf_multi_scan_result; + ASSERT_NO_FATAL_FAILURE(RangeScan(iter, ranges, upper_bound, + ingest_cf_multi_scan_result, true)); + ASSERT_EQ(expected_result, ingest_cf_multi_scan_result); + ASSERT_OK(iter->status()); + } + } + + void IngestFilesInOneLevel(const std::vector& ranges_in_level, + const std::string& ingest_file_name_prefix, + size_t& ingest_file_count, + const IngestExternalFileOptions& ifo, + bool combine_ranges = false) { + // Generate SST file and bulk load them one level at a time + std::vector ingest_files; + if (combine_ranges) { + size_t i = 0; + while (i < ranges_in_level.size()) { + // if combine ranges, generate 1 SST file that combines muliple ranges + // together + // Randomly combine ranges to SST file. + size_t batch_end_idx = + std::min(i + rnd.Uniform(3) + 2, ranges_in_level.size()); + bool data_added = false; + ASSERT_NO_FATAL_FAILURE(CreateSstFileWithRanges( + ingest_file_name_prefix + std::to_string(ingest_file_count), + {ranges_in_level.begin() + i, + ranges_in_level.begin() + batch_end_idx}, + data_added)); + if (data_added) { + ingest_files.push_back(ingest_file_name_prefix + + std::to_string(ingest_file_count)); + ingest_file_count++; + } + i = batch_end_idx; + } + } else { + for (auto const& range : ranges_in_level) { + if (!range.skipped) { + bool data_added = false; + ASSERT_NO_FATAL_FAILURE(CreateSstFileWithRanges( + ingest_file_name_prefix + std::to_string(ingest_file_count), + {range}, data_added)); + ASSERT_TRUE(data_added); + ingest_files.push_back(ingest_file_name_prefix + + std::to_string(ingest_file_count)); + ingest_file_count++; + } + } + } + + ASSERT_OK(db_->IngestExternalFile(ingest_cfh_, ingest_files, ifo)); + } + + void IngestDataToCF() { + IngestExternalFileOptions ifo; + ifo.snapshot_consistency = false; + auto ingest_file_name_prefix = dbname_ + "ingest_file_"; + size_t ingest_file_count = 0; + for (auto const& ranges_in_level : ranges_in_levels_) { + ASSERT_NO_FATAL_FAILURE(IngestFilesInOneLevel( + ranges_in_level, ingest_file_name_prefix, ingest_file_count, ifo)); + } + + ASSERT_GE(ingest_file_count, 0); + } + + void CompactIngestedCF() { + auto s = db_->CompactRange( + {.exclusive_manual_compaction = true, + .bottommost_level_compaction = BottommostLevelCompaction::kForce}, + ingest_cfh_, nullptr, nullptr); + ASSERT_OK(s); + } +}; + +TEST_P(UserDefinedIndexStressTest, PartialDeleteRange) { + // Create 2 column families. One use normal put/del, the other uses sst + // ingest Randomly generate multiple non overlapping range for multiple + // levels Range scan same range between the 2 CF and validate the result is + // same + SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_)); + dbname_ = + test::PerThreadDBPath("UserDefinedIndexStressTest_PartialDeleteRange"); + SCOPED_TRACE("dbname: " + dbname_); + ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_)); + + if (enable_udi_) { + // Skip UDI for now. + // The issue is that with UDI enabled, prepare might not prepare enough keys + // at lower level due to range delete from upper level. + // E.g. consider a LSM tree: + // L1: Data [0-1] + // L2: Delete Range [0-6] + // L3: Data [0-9] + // When multiscan queries range [0-9) with UDI count as 3, the L3 file + // will only prepare range [0-3). However, this range is masked out by upper + // layer delete range from [0-6] from L2. This causes query to only return + // [0,1], while [0,1,7] is the right result. Until prepare is able to + // preparing additional block supported, UDI is skipped. + return; + } + + for (int i = 0; i < 5; i++) { + ranges_in_levels_.push_back( + GenerateKeyRanges(rnd.Uniform(3) + 4, 2, + "L" + std::to_string(options_.num_levels - 1 - i))); + } + + ASSERT_NO_FATAL_FAILURE(IngestDataToCF()); + + if (enable_compaction_with_sst_partitioner_) { + ASSERT_NO_FATAL_FAILURE(CompactIngestedCF()); + } + + ASSERT_NO_FATAL_FAILURE(AddDataToRegularCF()); + + ASSERT_NO_FATAL_FAILURE(ValidateQueryResult()); +} + +TEST_P(UserDefinedIndexStressTest, DeleteRangeMixedWithDataFile) { + // Create 2 column families. One use normal put/del, the other uses sst + // ingest. + // Test the case where there are 3 levels, the middle level is a delete + // range file that span across the entire key space. The top and bottom level + // file have multiple files and each one has both data and delete range. Scan + // same range between the 2 CF and validate the result is same + SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_)); + dbname_ = test::PerThreadDBPath( + "UserDefinedIndexStressTest_DeleteRangeMixedWithDataFile"); + SCOPED_TRACE("dbname: " + dbname_); + ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_)); + + // Test 3 levels. + // Bottom level is mixed data with delete range. + ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 6, 2, "L6")); + // Middle level delete range across entire key space. + if (is_reverse_comparator_) { + ranges_in_levels_.push_back({{.start = 100, + .end = 0, + .is_range_delete = true, + .skipped = false, + .start_key = "keyz", + .end_key = "key"}}); + } else { + ranges_in_levels_.push_back({{.start = 0, + .end = 100, + .is_range_delete = true, + .skipped = false, + .start_key = "key", + .end_key = "keyz"}}); + } + + // Top level is mixed data with delete range. + ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 6, 2, "L4")); + + IngestExternalFileOptions ifo; + ifo.snapshot_consistency = false; + auto ingest_file_name_prefix = dbname_ + "ingest_file_"; + size_t ingest_file_count = 0; + auto first_level = true; + for (auto const& ranges_in_level : ranges_in_levels_) { + ASSERT_NO_FATAL_FAILURE( + IngestFilesInOneLevel(ranges_in_level, ingest_file_name_prefix, + ingest_file_count, ifo, /*combine_ranges=*/true)); + if (first_level) { + first_level = false; + if (enable_compaction_with_sst_partitioner_) { + // When compaction is enabled, do a compaction at the first level + ASSERT_NO_FATAL_FAILURE(CompactIngestedCF()); + } + } + } + + ASSERT_NO_FATAL_FAILURE(AddDataToRegularCF()); + + ASSERT_NO_FATAL_FAILURE(ValidateQueryResult()); +} + +TEST_P(UserDefinedIndexStressTest, DeleteRange) { + // Create 2 column families. One use normal put/del, the other uses sst + // ingest. + // Test the case where there are 3 levels, the middle level is a delete + // range file that span across the entire key space. Range scan same range + // between the 2 CF and validate the result is same + SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_)); + dbname_ = test::PerThreadDBPath("UserDefinedIndexStressTest_DeleteRange"); + SCOPED_TRACE("dbname: " + dbname_); + ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_)); + + // Test 3 levels. + // bottom level constains multiple files, each could have data or delete + // ranges or both. + ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L6")); + // middle level delete range across entire key space + if (is_reverse_comparator_) { + ranges_in_levels_.push_back({{.start = 100, + .end = 0, + .is_range_delete = true, + .skipped = false, + .start_key = "keyz", + .end_key = "key"}}); + } else { + ranges_in_levels_.push_back({{.start = 0, + .end = 100, + .is_range_delete = true, + .skipped = false, + .start_key = "key", + .end_key = "keyz"}}); + } + // Top level constains multiple files, each could have data or delete + // ranges or both. + ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L4")); + + IngestExternalFileOptions ifo; + ifo.snapshot_consistency = false; + auto ingest_file_name_prefix = dbname_ + "ingest_file_"; + size_t ingest_file_count = 0; + auto first_level = true; + for (auto const& ranges_in_level : ranges_in_levels_) { + ASSERT_NO_FATAL_FAILURE(IngestFilesInOneLevel( + ranges_in_level, ingest_file_name_prefix, ingest_file_count, ifo)); + if (first_level) { + first_level = false; + if (enable_compaction_with_sst_partitioner_) { + // When compaction is enabled, do a compaction at the first level + ASSERT_NO_FATAL_FAILURE(CompactIngestedCF()); + } + } + } + + ASSERT_NO_FATAL_FAILURE(AddDataToRegularCF()); + + ASSERT_NO_FATAL_FAILURE(ValidateQueryResult()); +} + +TEST_P(UserDefinedIndexStressTest, AtomicReplaceBulkLoad) { + // Create 2 column families. One use normal put/del, the other uses SST + // ingest. The SST ingest uses atomic range replace. + SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_)); + dbname_ = + test::PerThreadDBPath("UserDefinedIndexStressTest_AtomicReplaceBulkLoad"); + SCOPED_TRACE("dbname: " + dbname_); + ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_)); + + // Test 3 levels. + // bottom level constains multiple files, each could have data or delete + // ranges or both. + ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L6")); + // middle level delete range across entire key space + if (is_reverse_comparator_) { + ranges_in_levels_.push_back({{.start = 100, + .end = 0, + .is_range_delete = true, + .skipped = false, + .start_key = "keyz", + .end_key = "key"}}); + } else { + ranges_in_levels_.push_back({{.start = 0, + .end = 100, + .is_range_delete = true, + .skipped = false, + .start_key = "key", + .end_key = "keyz"}}); + } + // Top level constains multiple files, each could have data or delete + // ranges or both. + ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L4")); + + IngestExternalFileOptions ifo; + ifo.snapshot_consistency = false; + auto ingest_file_name_prefix = dbname_ + "ingest_file_"; + size_t ingest_file_count = 0; + auto first_level = true; + for (auto const& ranges_in_level : ranges_in_levels_) { + ASSERT_NO_FATAL_FAILURE(IngestFilesInOneLevel( + ranges_in_level, ingest_file_name_prefix, ingest_file_count, ifo)); + if (first_level) { + first_level = false; + if (enable_compaction_with_sst_partitioner_) { + // When compaction is enabled, do a compaction at the first level + ASSERT_NO_FATAL_FAILURE(CompactIngestedCF()); + } + } + } + + // Ingest the a new file with atomic replace with full key space, this layer + // is exactly same as the one at the top level + bool data_added; + ASSERT_NO_FATAL_FAILURE(CreateSstFileWithRanges( + ingest_file_name_prefix + std::to_string(++ingest_file_count), + ranges_in_levels_[2], data_added)); + + IngestExternalFileArg ingest_arg; + ingest_arg.column_family = ingest_cfh_; + ingest_arg.options = ifo; + ingest_arg.external_files.push_back(ingest_file_name_prefix + + std::to_string(ingest_file_count)); + ingest_arg.atomic_replace_range = RangeOpt(nullptr, nullptr); + + ASSERT_OK(db_->IngestExternalFiles( + std::vector({ingest_arg}))); + + ASSERT_NO_FATAL_FAILURE(AddDataToRegularCF()); + + ASSERT_NO_FATAL_FAILURE(ValidateQueryResult()); +} + +INSTANTIATE_TEST_CASE_P( + UserDefinedIndexStressTest, UserDefinedIndexStressTest, + testing::Combine(testing::Values(BytewiseComparator(), + ReverseBytewiseComparator()), + testing::Bool(), testing::Bool())); +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + // Opt-in this whole test file + ROCKSDB_NAMESPACE::TEST_AllowUnsupportedFormatVersion() = true; -int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/table/unique_id.cc b/table/unique_id.cc index 8bfa8bcfd383..6da691082770 100644 --- a/table/unique_id.cc +++ b/table/unique_id.cc @@ -14,7 +14,7 @@ namespace ROCKSDB_NAMESPACE { std::string EncodeSessionId(uint64_t upper, uint64_t lower) { std::string db_session_id(20U, '\0'); - char *buf = db_session_id.data(); + char* buf = db_session_id.data(); // Preserving `lower` is slightly tricky. 36^12 is slightly more than // 62 bits, so we use 12 chars plus the bottom two bits of one more. // (A tiny fraction of 20 digit strings go unused.) @@ -26,8 +26,8 @@ std::string EncodeSessionId(uint64_t upper, uint64_t lower) { return db_session_id; } -Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper, - uint64_t *lower) { +Status DecodeSessionId(const std::string& db_session_id, uint64_t* upper, + uint64_t* lower) { const size_t len = db_session_id.size(); if (len == 0) { return Status::NotSupported("Missing db_session_id"); @@ -41,7 +41,7 @@ Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper, return Status::NotSupported("Too long db_session_id"); } uint64_t a = 0, b = 0; - const char *buf = &db_session_id.front(); + const char* buf = &db_session_id.front(); bool success = ParseBaseChars<36>(&buf, len - 12U, &a); if (!success) { return Status::NotSupported("Bad digit in db_session_id"); @@ -56,8 +56,8 @@ Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper, return Status::OK(); } -Status GetSstInternalUniqueId(const std::string &db_id, - const std::string &db_session_id, +Status GetSstInternalUniqueId(const std::string& db_id, + const std::string& db_session_id, uint64_t file_number, UniqueIdPtr out, bool force) { if (!force) { @@ -160,11 +160,11 @@ std::string EncodeUniqueIdBytes(UniqueIdPtr in) { return ret; } -Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueIdPtr out) { +Status DecodeUniqueIdBytes(const std::string& unique_id, UniqueIdPtr out) { if (unique_id.size() != (out.extended ? 24 : 16)) { return Status::NotSupported("Not a valid unique_id"); } - const char *buf = &unique_id.front(); + const char* buf = &unique_id.front(); out.ptr[0] = DecodeFixed64(&buf[0]); out.ptr[1] = DecodeFixed64(&buf[8]); if (out.extended) { @@ -174,8 +174,8 @@ Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueIdPtr out) { } template -Status GetUniqueIdFromTablePropertiesHelper(const TableProperties &props, - std::string *out_id) { +Status GetUniqueIdFromTablePropertiesHelper(const TableProperties& props, + std::string* out_id) { ID tmp{}; Status s = GetSstInternalUniqueId(props.db_id, props.db_session_id, props.orig_file_number, &tmp); @@ -188,23 +188,27 @@ Status GetUniqueIdFromTablePropertiesHelper(const TableProperties &props, return s; } -Status GetExtendedUniqueIdFromTableProperties(const TableProperties &props, - std::string *out_id) { +Status GetExtendedUniqueIdFromTableProperties(const TableProperties& props, + std::string* out_id) { return GetUniqueIdFromTablePropertiesHelper(props, out_id); } -Status GetUniqueIdFromTableProperties(const TableProperties &props, - std::string *out_id) { +Status GetUniqueIdFromTableProperties(const TableProperties& props, + std::string* out_id) { return GetUniqueIdFromTablePropertiesHelper(props, out_id); } -std::string UniqueIdToHumanString(const std::string &id) { - // Not so efficient, but that's OK - std::string str = Slice(id).ToString(/*hex*/ true); - for (size_t i = 16; i < str.size(); i += 17) { - str.insert(i, "-"); +std::string UniqueIdToHumanString(const std::string& id) { + std::string hex = Slice(id).ToString(/*hex*/ true); + std::string result; + result.reserve(hex.size() + hex.size() / 16); + for (size_t i = 0; i < hex.size(); i++) { + if (i > 0 && i % 16 == 0) { + result.push_back('-'); + } + result.push_back(hex[i]); } - return str; + return result; } std::string InternalUniqueIdToHumanString(UniqueIdPtr in) { diff --git a/table/unique_id_impl.h b/table/unique_id_impl.h index 6e3dc62c794d..47d10c9712be 100644 --- a/table/unique_id_impl.h +++ b/table/unique_id_impl.h @@ -26,14 +26,14 @@ constexpr UniqueId64x3 kNullUniqueId64x3 = {}; // Dynamic pointer wrapper for one of the two above struct UniqueIdPtr { - uint64_t *ptr = nullptr; + uint64_t* ptr = nullptr; bool extended = false; - /*implicit*/ UniqueIdPtr(UniqueId64x2 *id) { + /*implicit*/ UniqueIdPtr(UniqueId64x2* id) { ptr = (*id).data(); extended = false; } - /*implicit*/ UniqueIdPtr(UniqueId64x3 *id) { + /*implicit*/ UniqueIdPtr(UniqueId64x3* id) { ptr = (*id).data(); extended = true; } @@ -45,8 +45,8 @@ struct UniqueIdPtr { // unique id, so can be manipulated in more ways but very carefully. // These must be long term stable to ensure GetUniqueIdFromTableProperties // is long term stable. -Status GetSstInternalUniqueId(const std::string &db_id, - const std::string &db_session_id, +Status GetSstInternalUniqueId(const std::string& db_id, + const std::string& db_session_id, uint64_t file_number, UniqueIdPtr out, bool force = false); @@ -66,7 +66,7 @@ void ExternalUniqueIdToInternal(UniqueIdPtr in_out); std::string EncodeUniqueIdBytes(UniqueIdPtr in); // Reverse of EncodeUniqueIdBytes. -Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueIdPtr out); +Status DecodeUniqueIdBytes(const std::string& unique_id, UniqueIdPtr out); // For presenting internal IDs for debugging purposes. Visually distinct from // UniqueIdToHumanString for external IDs. @@ -87,7 +87,7 @@ std::string EncodeSessionId(uint64_t upper, uint64_t lower); // Reverse of EncodeSessionId. Returns NotSupported on error rather than // Corruption because non-standard session IDs should be allowed with degraded // functionality. -Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper, - uint64_t *lower); +Status DecodeSessionId(const std::string& db_session_id, uint64_t* upper, + uint64_t* lower); } // namespace ROCKSDB_NAMESPACE diff --git a/test_util/sync_point.cc b/test_util/sync_point.cc index bec02d4f67a3..2b9ab2f69625 100644 --- a/test_util/sync_point.cc +++ b/test_util/sync_point.cc @@ -79,4 +79,8 @@ void SetupSyncPointsToMockDirectIO() { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); #endif } + +#ifndef NDEBUG +std::atomic g_throw_on_testable_assertion_failure{0}; +#endif // NDEBUG } // namespace ROCKSDB_NAMESPACE diff --git a/test_util/sync_point.h b/test_util/sync_point.h index 6022073e573a..081e90cb1231 100644 --- a/test_util/sync_point.h +++ b/test_util/sync_point.h @@ -6,10 +6,9 @@ #include +#include #include -#include #include -#include #include #include "rocksdb/rocksdb_namespace.h" @@ -180,3 +179,37 @@ void SetupSyncPointsToMockDirectIO(); } \ } #endif // NDEBUG + +// An alternative to assert() that is more test-friendly than using +// ASSERT_DEATH. Relies on exception propagation. +#ifdef NDEBUG +#define testable_assert(cond) +#else +namespace ROCKSDB_NAMESPACE { +// Intentionally not based on std::exception to reduce places where this +// would be caught +struct TestableAssertionFailure {}; +// Tracks whether to throw on testable_assert failure instead of aborting. +// This is an atomic counter for re-entrancy / thread-safety. +extern std::atomic g_throw_on_testable_assertion_failure; +} // namespace ROCKSDB_NAMESPACE +#define testable_assert(cond) \ + do { \ + if (ROCKSDB_NAMESPACE::g_throw_on_testable_assertion_failure.load( \ + std::memory_order_relaxed) > 0) { \ + if (cond) { \ + } else \ + throw ROCKSDB_NAMESPACE::TestableAssertionFailure(); \ + } else { \ + assert(cond); \ + } \ + } while (0) // require ; in caller +#define ASSERT_TESTABLE_FAILURE(expr) \ + do { \ + ROCKSDB_NAMESPACE::g_throw_on_testable_assertion_failure.fetch_add( \ + 1, std::memory_order_relaxed); \ + ASSERT_THROW(expr, ROCKSDB_NAMESPACE::TestableAssertionFailure); \ + ROCKSDB_NAMESPACE::g_throw_on_testable_assertion_failure.fetch_sub( \ + 1, std::memory_order_relaxed); \ + } while (0) // require ; in caller +#endif diff --git a/test_util/testutil.cc b/test_util/testutil.cc index 35884a7b3789..f9f9e0bf680a 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -29,6 +29,7 @@ #include "test_util/mock_time_env.h" #include "test_util/sync_point.h" #include "util/random.h" +#include "util/string_util.h" #ifndef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} @@ -44,7 +45,7 @@ const std::set kFooterFormatVersionsToTest{ 6U, // In case any interesting future changes kDefaultFormatVersion, - kLatestFormatVersion, + kLatestBbtFormatVersion, }; const ReadOptionsNoIo kReadOptionsNoIo; @@ -91,9 +92,9 @@ bool ShouldPersistUDT(const UserDefinedTimestampTestMode& test_mode) { return test_mode != UserDefinedTimestampTestMode::kStripUserDefinedTimestamp; } -Slice CompressibleString(Random* rnd, double compressed_fraction, int len, +Slice CompressibleString(Random* rnd, double compressed_to_fraction, int len, std::string* dst) { - int raw = static_cast(len * compressed_fraction); + int raw = static_cast(len * compressed_to_fraction); if (raw < 1) { raw = 1; } @@ -311,7 +312,6 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) { db_opt->track_and_verify_wals = rnd->Uniform(2); db_opt->verify_sst_unique_id_in_manifest = rnd->Uniform(2); db_opt->skip_stats_update_on_db_open = rnd->Uniform(2); - db_opt->skip_checking_sst_file_sizes_on_db_open = rnd->Uniform(2); db_opt->use_adaptive_mutex = rnd->Uniform(2); db_opt->use_fsync = rnd->Uniform(2); db_opt->recycle_log_file_num = rnd->Uniform(2); @@ -386,7 +386,6 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options, cf_opt->level0_stop_writes_trigger = rnd->Uniform(100); cf_opt->max_bytes_for_level_multiplier = rnd->Uniform(100); cf_opt->max_write_buffer_number = rnd->Uniform(100); - cf_opt->max_write_buffer_number_to_maintain = rnd->Uniform(100); cf_opt->max_write_buffer_size_to_maintain = rnd->Uniform(10000); cf_opt->min_write_buffer_number_to_merge = rnd->Uniform(100); cf_opt->num_levels = rnd->Uniform(100); diff --git a/test_util/testutil.h b/test_util/testutil.h index 2d693b5f201f..c07b0139a4d4 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -23,6 +23,7 @@ #include "rocksdb/slice.h" #include "rocksdb/table.h" #include "table/internal_iterator.h" +#include "util/defer.h" #include "util/mutexlock.h" #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS @@ -71,9 +72,16 @@ bool ShouldPersistUDT(const UserDefinedTimestampTestMode& test_mode); // Store in *dst a string of length "len" that will compress to // "N*compressed_fraction" bytes and return a Slice that references // the generated data. -Slice CompressibleString(Random* rnd, double compressed_fraction, int len, +Slice CompressibleString(Random* rnd, double compressed_to_fraction, int len, std::string* dst); +inline std::string CompressibleString(Random* rnd, + double compressed_to_fraction, int len) { + std::string dst; + CompressibleString(rnd, compressed_to_fraction, len, &dst); + return dst; +} + #ifndef NDEBUG // An internal comparator that just forward comparing results from the // user comparator in it. Can be used to test entities that have no dependency @@ -359,6 +367,11 @@ class StringSource : public FSRandomAccessFile { void set_total_reads(int tr) { total_reads_ = tr; } + IOStatus GetFileSize(uint64_t* file_size) override { + *file_size = contents_.size(); + return IOStatus::OK(); + } + private: std::string contents_; uint64_t uniq_id_; @@ -731,6 +744,149 @@ class StringFS : public FileSystemWrapper { std::unordered_map files_; }; +// A compressor that essentially implements a custom compression algorithm +// by leveraging an existing compression algorithm and putting a custom header +// on it to detect any attempts to decompress it with the wrong compression +// type or dictionary. +template +struct CompressorCustomAlg : public CompressorWrapper { + static bool Supported() { return LZ4_Supported(); } + + explicit CompressorCustomAlg( + std::unique_ptr wrapped = + GetBuiltinV2CompressionManager()->GetCompressor({}, kLZ4Compression)) + : CompressorWrapper(std::move(wrapped)), + dictionary_hash_(GetSliceHash(wrapped_->GetSerializedDict())) { + static_assert(kCompression > kLastBuiltinCompression); + } + + const char* Name() const override { return "CompressorCustomAlg"; } + + CompressionType GetPreferredCompressionType() const override { + return kCompression; + } + + std::unique_ptr Clone() const override { + return std::make_unique(wrapped_->Clone()); + } + + Status CompressBlock(Slice uncompressed_data, char* compressed_output, + size_t* compressed_output_size, + CompressionType* out_compression_type, + ManagedWorkingArea* working_area) override { + size_t allowed_output_size = *compressed_output_size; + Status s = wrapped_->CompressBlock(uncompressed_data, compressed_output, + compressed_output_size, + out_compression_type, working_area); + if (s.ok() && *out_compression_type != kNoCompression) { + assert(*out_compression_type == kLZ4Compression); + if (*compressed_output_size + 5 > allowed_output_size) { + *out_compression_type = kNoCompression; + return Status::OK(); + } + // Generate & insert header + std::memmove(compressed_output + 5, compressed_output, + *compressed_output_size); + compressed_output[0] = lossless_cast(kCompression); + EncodeFixed32(&compressed_output[1], dictionary_hash_); + *compressed_output_size += 5; + *out_compression_type = kCompression; + } + return s; + } + + std::unique_ptr MaybeCloneSpecialized( + CacheEntryRole block_type, DictConfigArgs&& dict_config) const override { + auto clone = + wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_config)); + return std::make_unique(std::move(clone)); + } + + protected: + uint32_t dictionary_hash_; +}; + +// A decompressor suitable for all the instantiable CompressorCustomAlg +// implementations. Can be configured to check that it is only used to +// decompress certain types using SetAllowedTypes(). +struct DecompressorCustomAlg : public DecompressorWrapper { + using TypeSet = SmallEnumSet; + + DecompressorCustomAlg(std::shared_ptr wrapped = + GetBuiltinV2CompressionManager()->GetDecompressor()) + : DecompressorWrapper(std::move(wrapped)), + dictionary_hash_(GetSliceHash(wrapped_->GetSerializedDict())), + allowed_types_(TypeSet::All()) {} + + const char* Name() const override { return "DecompressorCustomAlg"; } + + Status MaybeCloneForDict(const Slice& serialized_dict, + std::unique_ptr* out) override { + Status s = wrapped_->MaybeCloneForDict(serialized_dict, out); + if (s.ok()) { + assert(*out != nullptr); + auto clone = std::make_unique(std::move(*out)); + clone->SetAllowedTypes(allowed_types_); + *out = std::move(clone); + assert(out->get()->GetSerializedDict() == serialized_dict); + } else { + assert(*out == nullptr); + } + return s; + } + + Status ExtractUncompressedSize(Args& args) override { + if (args.compression_type >= kFirstCustomCompression && + args.compression_type <= kLastCustomCompression) { + assert(args.compressed_data.size() > 0); + assert(args.compressed_data[0] == + lossless_cast(args.compression_type)); + assert(DecodeFixed32(args.compressed_data.data() + 1) == + dictionary_hash_); + // Strip off our header because ExtractUncompressedSize() is also going + // to strip off the uncompressed size data. + args.compressed_data.remove_prefix(5); + // It's ok to modify other parts of args if we restore to original + SaveAndRestore save_compression_type( + &args.compression_type, kLZ4Compression); + return wrapped_->ExtractUncompressedSize(args); + } else { + // Also support built-in compressions + return wrapped_->ExtractUncompressedSize(args); + } + } + + Status DecompressBlock(const Args& args, char* uncompressed_output) override { + if (args.compression_type >= kFirstCustomCompression && + args.compression_type <= kLastCustomCompression) { + // Also allowed to copy args and modify + Args modified_args = args; + modified_args.compression_type = kLZ4Compression; + return wrapped_->DecompressBlock(modified_args, uncompressed_output); + } else { + // Also support built-in compressions + return wrapped_->DecompressBlock(args, uncompressed_output); + } + } + + void SetAllowedTypes(const CompressionType* types_begin, + const CompressionType* types_end) { + TypeSet allowed_types; + for (auto type = types_begin; type != types_end; ++type) { + allowed_types.Add(*type); + } + allowed_types_ = std::move(allowed_types); + } + + void SetAllowedTypes(TypeSet allowed_types) { + allowed_types_ = std::move(allowed_types); + } + + protected: + uint32_t dictionary_hash_; + SmallEnumSet allowed_types_; +}; + // Randomly initialize the given DBOptions void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); diff --git a/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc b/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc index b19c9f2a8115..f3c10c469daf 100644 --- a/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc +++ b/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc @@ -477,6 +477,38 @@ GTEST_DECLARE_bool_(death_test_use_fork); namespace internal { +template +AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression, + const char* rhs_expression, + RawType lhs_value, RawType rhs_value) { + const FloatingPoint lhs(lhs_value), rhs(rhs_value); + + if (lhs.AlmostEquals(rhs)) { + return AssertionSuccess(); + } + + ::std::stringstream lhs_ss; + lhs_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << lhs_value; + + ::std::stringstream rhs_ss; + rhs_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << rhs_value; + + return EqFailure(lhs_expression, rhs_expression, + StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss), + false); +} + +template +AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression, + const char* rhs_expression, + float lhs_value, float rhs_value); +template +AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression, + const char* rhs_expression, + double lhs_value, double rhs_value); + // The value of GetTestTypeId() as seen from within the Google Test // library. This is solely for testing GetTestTypeId(). GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest; diff --git a/third-party/gtest-1.8.1/fused-src/gtest/gtest.h b/third-party/gtest-1.8.1/fused-src/gtest/gtest.h index 2d82d8e4d0b1..f6e3fabed005 100644 --- a/third-party/gtest-1.8.1/fused-src/gtest/gtest.h +++ b/third-party/gtest-1.8.1/fused-src/gtest/gtest.h @@ -3973,7 +3973,7 @@ const char* StringFromGTestEnv(const char* flag, const char* default_val); #include #include #include -#include +// #include // Not included in newer versions of gtest #include #include #include @@ -21451,27 +21451,7 @@ template AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression, const char* rhs_expression, RawType lhs_value, - RawType rhs_value) { - const FloatingPoint lhs(lhs_value), rhs(rhs_value); - - if (lhs.AlmostEquals(rhs)) { - return AssertionSuccess(); - } - - ::std::stringstream lhs_ss; - lhs_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << lhs_value; - - ::std::stringstream rhs_ss; - rhs_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << rhs_value; - - return EqFailure(lhs_expression, - rhs_expression, - StringStreamToString(&lhs_ss), - StringStreamToString(&rhs_ss), - false); -} + RawType rhs_value); // Helper function for implementing ASSERT_NEAR. // diff --git a/tools/blob_dump.cc b/tools/blob_dump.cc index 23b5f8f7903a..520b194ee1a2 100644 --- a/tools/blob_dump.cc +++ b/tools/blob_dump.cc @@ -27,12 +27,10 @@ int main(int argc, char** argv) { {"file", required_argument, nullptr, 'f'}, {"show_key", optional_argument, nullptr, 'k'}, {"show_blob", optional_argument, nullptr, 'b'}, - {"show_uncompressed_blob", optional_argument, nullptr, 'r'}, {"show_summary", optional_argument, nullptr, 's'}, }; DisplayType show_key = DisplayType::kRaw; DisplayType show_blob = DisplayType::kNone; - DisplayType show_uncompressed_blob = DisplayType::kNone; bool show_summary = false; std::string file; while (true) { @@ -47,7 +45,6 @@ int main(int argc, char** argv) { "Usage: blob_dump --file=filename " "[--show_key[=none|raw|hex|detail]] " "[--show_blob[=none|raw|hex|detail]] " - "[--show_uncompressed_blob[=none|raw|hex|detail]] " "[--show_summary]\n"); return 0; case 'f': @@ -73,17 +70,6 @@ int main(int argc, char** argv) { show_blob = DisplayType::kHex; } break; - case 'r': - if (optarg) { - if (display_types.count(arg_str) == 0) { - fprintf(stderr, "Unrecognized blob display type.\n"); - return -1; - } - show_uncompressed_blob = display_types.at(arg_str); - } else { - show_uncompressed_blob = DisplayType::kHex; - } - break; case 's': show_summary = true; break; @@ -93,8 +79,7 @@ int main(int argc, char** argv) { } } BlobDumpTool tool; - Status s = - tool.Run(file, show_key, show_blob, show_uncompressed_blob, show_summary); + Status s = tool.Run(file, show_key, show_blob, show_summary); if (!s.ok()) { fprintf(stderr, "Failed: %s\n", s.ToString().c_str()); return -1; diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc index 77a6d1b2bb3b..146e1d5c174e 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc @@ -26,6 +26,7 @@ int main() { #include "test_util/testutil.h" #include "tools/block_cache_analyzer/block_cache_trace_analyzer.h" #include "trace_replay/block_cache_tracer.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -88,7 +89,7 @@ class BlockCacheTracerTest : public testing::Test { case 4: return TableReaderCaller::kUserIterator; } - // This cannot happend. + // This cannot happen. assert(false); return TableReaderCaller::kMaxBlockCacheLookupCaller; } diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh index b137fcc2a922..44c513caf2f5 100755 --- a/tools/check_format_compatible.sh +++ b/tools/check_format_compatible.sh @@ -11,6 +11,8 @@ # Return value 0 means all regression tests pass. 1 if not pass. # # Environment options: +# SANITY_CHECK=1 - Do a syntax check and git checkout test as a sanity check +# that the script hasn't been broken by e.g. adding a new release wrongly. # SHORT_TEST=1 - Test only the oldest branch for each kind of test. This is # a good choice for PR validation as it is relatively fast and will find # most issues. @@ -135,7 +137,7 @@ EOF # To check for DB forward compatibility with loading options (old version # reading data from new), as well as backward compatibility -declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb") +declare -a db_forward_with_options_refs=("10.4.fb" "10.5.fb" "10.6.fb" "10.7.fb" "10.8.fb" "10.9.fb" "10.10.fb" "10.11.fb") # To check for DB forward compatibility without loading options (in addition # to the "with loading options" set), as well as backward compatibility declare -a db_forward_no_options_refs=() # N/A at the moment @@ -143,7 +145,7 @@ declare -a db_forward_no_options_refs=() # N/A at the moment # To check for SST ingestion backward compatibility (new version reading # data from old) (ldb ingest_extern_sst added in 5.16.x, back-ported to # 5.14.x, 5.15.x) -declare -a ext_backward_only_refs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb" "6.4.fb" "6.5.fb" "6.6.fb" "6.7.fb" "6.8.fb" "6.9.fb" "6.10.fb" "6.11.fb" "6.12.fb" "6.13.fb" "6.14.fb" "6.15.fb" "6.16.fb" "6.17.fb" "6.18.fb" "6.19.fb" "6.20.fb" "6.21.fb" "6.22.fb" "6.23.fb" "6.24.fb" "6.25.fb" "6.26.fb" "6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb" "7.7.fb" "7.8.fb" "7.9.fb" "7.10.fb" "8.0.fb" "8.1.fb" "8.2.fb" "8.3.fb" "8.4.fb" "8.5.fb") +declare -a ext_backward_only_refs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb" "6.4.fb" "6.5.fb" "6.6.fb" "6.7.fb" "6.8.fb" "6.9.fb" "6.10.fb" "6.11.fb" "6.12.fb" "6.13.fb" "6.14.fb" "6.15.fb" "6.16.fb" "6.17.fb" "6.18.fb" "6.19.fb" "6.20.fb" "6.21.fb" "6.22.fb" "6.23.fb" "6.24.fb" "6.25.fb" "6.26.fb" "6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb" "7.7.fb" "7.8.fb" "7.9.fb" "7.10.fb" "8.0.fb" "8.1.fb" "8.2.fb" "8.3.fb" "8.4.fb" "8.5.fb" "8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb") # To check for SST ingestion forward compatibility (old version reading # data from new) as well as backward compatibility declare -a ext_forward_refs=("${db_forward_no_options_refs[@]}" "${db_forward_with_options_refs[@]}") @@ -157,8 +159,9 @@ declare -a bak_forward_refs=("${db_forward_no_options_refs[@]}" "${db_forward_wi # Branches (git refs) to check for DB backward compatibility (new version # reading data from old) (in addition to the "forward compatible" list) -# NOTE: 2.7.fb.branch shows assertion violation in some configurations -declare -a db_backward_only_refs=("2.2.fb.branch" "2.3.fb.branch" "2.4.fb.branch" "2.5.fb.branch" "2.6.fb.branch" "2.8.1.fb" "3.0.fb.branch" "3.1.fb" "3.2.fb" "3.3.fb" "3.4.fb" "3.5.fb" "3.6.fb" "3.7.fb" "3.8.fb" "3.9.fb" "4.2.fb" "4.3.fb" "4.4.fb" "4.5.fb" "4.6.fb" "4.7.fb" "4.8.fb" "4.9.fb" "4.10.fb" "${bak_backward_only_refs[@]}") +# NOTE: format_version < 2 support was removed, so we only test back to 4.6.fb +# (when format_version=2 became the default) +declare -a db_backward_only_refs=("4.6.fb" "4.7.fb" "4.8.fb" "4.9.fb" "4.10.fb" "${bak_backward_only_refs[@]}") if [ "$SHORT_TEST" ]; then # Use only the first (if exists) of each list @@ -195,10 +198,14 @@ if [ "$SHORT_TEST" == "" ]; then done fi +invoke_make() +{ + [ "$SANITY_CHECK" ] || make "$@" +} generate_db() { set +e - bash "$script_copy_dir"/generate_random_db.sh "$1" "$2" + [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/generate_random_db.sh "$1" "$2" if [ $? -ne 0 ]; then echo ==== Error loading data from $2 to $1 ==== exit 1 @@ -209,7 +216,7 @@ generate_db() compare_db() { set +e - bash "$script_copy_dir"/verify_random_db.sh "$1" "$2" "$3" "$4" "$5" + [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/verify_random_db.sh "$1" "$2" "$3" "$4" "$5" if [ $? -ne 0 ]; then echo ==== Read different content from $1 and $2 or error happened. ==== exit 1 @@ -217,10 +224,21 @@ compare_db() set -e } +compact_db() +{ + set +e + [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/compact_db.sh "$1" "$2" "$3" + if [ $? -ne 0 ]; then + echo ==== Error compacting DB at $1 ==== + exit 1 + fi + set -e +} + write_external_sst() { set +e - bash "$script_copy_dir"/write_external_sst.sh "$1" "$2" "$3" + [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/write_external_sst.sh "$1" "$2" "$3" if [ $? -ne 0 ]; then echo ==== Error writing external SST file using data from $1 to $3 ==== exit 1 @@ -231,7 +249,7 @@ write_external_sst() ingest_external_sst() { set +e - bash "$script_copy_dir"/ingest_external_sst.sh "$1" "$2" + [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/ingest_external_sst.sh "$1" "$2" if [ $? -ne 0 ]; then echo ==== Error ingesting external SST in $2 to DB at $1 ==== exit 1 @@ -242,7 +260,7 @@ ingest_external_sst() backup_db() { set +e - bash "$script_copy_dir"/backup_db.sh "$1" "$2" + [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/backup_db.sh "$1" "$2" if [ $? -ne 0 ]; then echo ==== Error backing up DB $1 to $2 ==== exit 1 @@ -253,7 +271,7 @@ backup_db() restore_db() { set +e - bash "$script_copy_dir"/restore_db.sh "$1" "$2" + [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/restore_db.sh "$1" "$2" if [ $? -ne 0 ]; then echo ==== Error restoring from $1 to $2 ==== exit 1 @@ -297,8 +315,8 @@ current_checkout_name="$current_checkout_name ($current_checkout_hash)" echo "== Building $current_checkout_name debug" git checkout -B $tmp_branch $current_checkout_hash force_no_fbcode -make clean -DISABLE_WARNING_AS_ERROR=1 make ldb -j$J +invoke_make clean +DISABLE_WARNING_AS_ERROR=1 invoke_make ldb -j$J echo "== Using $current_checkout_name, generate DB with extern SST and ingest" current_ext_test_dir=$ext_test_dir"/current" @@ -318,8 +336,8 @@ do echo "== Building $checkout_ref debug" git reset --hard $tmp_origin/$checkout_ref force_no_fbcode - make clean - DISABLE_WARNING_AS_ERROR=1 make ldb -j$J + invoke_make clean + DISABLE_WARNING_AS_ERROR=1 invoke_make ldb -j$J # We currently assume DB backward compatibility for every branch listed echo "== Use $checkout_ref to generate a DB ..." @@ -349,6 +367,13 @@ do then echo "== Use $checkout_ref to open DB generated using $current_checkout_name..." compare_db $db_test_dir/$checkout_ref $current_db_test_dir forward_${checkout_ref}_dump.txt 0 + + echo "== Use $checkout_ref to compact a copy of DB generated using $current_checkout_name..." + [ "$SANITY_CHECK" ] || cp -a $current_db_test_dir ${current_db_test_dir}_copy_for_${checkout_ref} + compact_db ${current_db_test_dir}_copy_for_${checkout_ref} 0 + + echo "== After compaction, re-verify DB copy originally from $current_checkout_name..." + compare_db ${current_db_test_dir}_copy_for_${checkout_ref} $current_db_test_dir forward_${checkout_ref}_dump_after_compact.txt 0 fi if member_of_array "$checkout_ref" "${db_forward_with_options_refs[@]}" @@ -376,15 +401,21 @@ done echo "== Building $current_checkout_name debug (again, final)" git reset --hard $current_checkout_hash force_no_fbcode -make clean -DISABLE_WARNING_AS_ERROR=1 make ldb -j$J +invoke_make clean +DISABLE_WARNING_AS_ERROR=1 invoke_make ldb -j$J for checkout_ref in "${checkout_refs[@]}" do - # We currently assume DB backward compatibility for every branch listed + # We assume DB backward compatibility for every branch listed echo "== Use $current_checkout_name to open DB generated using $checkout_ref..." compare_db $db_test_dir/$checkout_ref $current_db_test_dir db_dump.txt 1 0 + echo "== Use $current_checkout_name to compact DB generated using $checkout_ref..." + compact_db $db_test_dir/$checkout_ref 1 0 + + echo "== After compaction, re-verify DB originally from $checkout_ref..." + compare_db $db_test_dir/$checkout_ref $current_db_test_dir db_dump_after_compact.txt 1 0 + if member_of_array "$checkout_ref" "${ext_backward_only_refs[@]}" || member_of_array "$checkout_ref" "${ext_forward_refs[@]}" then @@ -404,4 +435,8 @@ do fi done -echo ==== Compatibility Test PASSED ==== +if [ "$SANITY_CHECK" ]; then + echo "==== check_format_compatible.sh sanity check PASSED ====" +else + echo ==== Compatibility Test PASSED ==== +fi diff --git a/tools/compact_db.sh b/tools/compact_db.sh new file mode 100755 index 000000000000..8bcd95c0e906 --- /dev/null +++ b/tools/compact_db.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# +# A shell script to compact DB generated by generate_random_db.sh. +# ./ldb needs to be available to be executed. +# +# Usage: